//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. inserting instructions at
/// position IP1 may change the meaning of IP2 or vice-versa. This is because
/// an InsertPoint stores the instruction before which something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternately creating instructions will cause them to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
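
// For example, if IP1 and IP2 both point before the same instruction, code
// emitted at IP1 also lands before IP2's instruction; later insertions at the
// two points would therefore interleave, which is why such a pair is
// considered ambiguous.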

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
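
// For instance, `schedule(static, 4)` maps to BaseStaticChunked, a bare
// `schedule(static)` maps to BaseStatic, and `schedule(simd:runtime)` maps
// to BaseRuntimeSimd.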

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
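
// Worked example: `schedule(dynamic, 4)` with no ordered clause and no
// monotonicity modifier computes BaseDynamicChunked | ModifierUnordered |
// ModifierNonmonotonic, since per OpenMP 5.1 a non-static, non-ordered
// schedule defaults to nonmonotonic.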

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
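
// Typical use is to fetch the callee once and emit calls against it; e.g.
// getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier) returns the (possibly
// freshly declared) __kmpc_barrier together with its function type.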

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target function, or we risk malformed optimisations by later passes. This
  // is only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it is up to whoever inserts into the list to do so).
  // This notably has to occur after the OutlineInfo candidates have been
  // extracted, so the end product is not implicitly adversely affected by any
  // raises unless intentionally appended to the list.
  // NOTE: This only raises ConstantData allocas. It could be extended to
  // ConstantExprs with further effort; however, those should largely be folded
  // by the time they get here. Extending it to runtime-defined or writable
  // allocation sizes would be non-trivial (movement of any stores to variables
  // the allocation size depends on, as well as the usual loads, would have to
  // be factored in, otherwise the result is wrong after movement) and would
  // likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization\n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
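
// The encoded location string therefore has the form
// ";FileName;FunctionName;Line;Column;;", e.g. ";test.c;foo;3;7;;" for
// function foo at test.c:3:7.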

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}
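
// This emits IR of the form (sketch):
//   %omp_global_thread_num = call i32 @__kmpc_global_thread_num(ptr @ident)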

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
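
// For an explicit, non-cancellable barrier the emitted IR looks like this
// (sketch):
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)
// In a cancellable region, __kmpc_cancel_barrier is called instead and its
// result feeds emitCancelationCheckImpl.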

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1055
1057 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1058 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1059 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1060 if (!updateToLocation(Loc))
1061 return Loc.IP;
1062
1063 Builder.restoreIP(AllocaIP);
1064 auto *KernelArgsPtr =
1065 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1066 Builder.restoreIP(Loc.IP);
1067
1068 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1069 llvm::Value *Arg =
1070 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1072 KernelArgs[I], Arg,
1073 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1074 }
1075
1076 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1077 NumThreads, HostPtr, KernelArgsPtr};
1078
1079 Return = Builder.CreateCall(
1080 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1081 OffloadingArgs);
1082
1083 return Builder.saveIP();
1084}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
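
// The resulting control flow has this shape (sketch):
//
//   BB          <- contains the runtime call producing CancelFlag
//   |    \
//   |   BB.cncl <- CancelFlag != 0: runs ExitCB and FiniCB, leaves the region
//   V
//   BB.cont     <- CancelFlag == 0: code generation continues here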

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by a call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
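
// The replacement call emitted above has the shape (sketch):
//   call void @__kmpc_parallel_51(ptr @ident, i32 %tid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1, ptr @outlined.fn,
//                                 ptr null, ptr %args, i64 %num_args)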

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by a call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
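
// The replacement call emitted above has the shape (sketch):
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr @ident, i32 %num_captured, ptr @outlined.fn, ...)
// with the captured variables passed as the trailing variadic arguments.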

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB       <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB        <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB        <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB        <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
    return Err;

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
                              /*CollectGlobalInputs=*/true);

  Inputs.remove_if([&](Value *I) {
    if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
      return GV->getValueType() == OpenMPIRBuilder::Ident;

    return false;
  });

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");

  FunctionCallee TIDRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);

  auto PrivHelper = [&](Value &V) -> Error {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return Error::success();
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto stack and load it back inside the to-be-outlined region. This
    // will ensure only the pointer will be passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      IRBuilder<>::InsertPointGuard Guard(Builder);
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to stack at end of the block that currently branches to the
      // entry block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to allocations in the to-be-outlined region.
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      InsertPointOrErrorTy AfterIP =
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      InnerAllocaIP = {
          InnerAllocaIP.getBlock(),
          InnerAllocaIP.getBlock()->getTerminator()->getIterator()};

      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return Error::success();
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);

    return Error::success();
  };

  // Reset the inner alloca insertion as it will be used for loading the values
  // wrapped into pointers before passing them into the to-be-outlined region.
  // Configure it to insert immediately after the fake use of zero address so
  // that they are available in the generated body and so that the
  // OpenMP-related values (thread ID and zero address pointers) remain leading
  // in the argument list.
  InnerAllocaIP = IRBuilder<>::InsertPoint(
      ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());

  // Reset the outer alloca insertion point to the entry of the relevant block
  // in case it was invalidated.
  OuterAllocaIP = IRBuilder<>::InsertPoint(
      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());

  for (Value *Input : Inputs) {
    LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
    if (Error Err = PrivHelper(*Input))
      return Err;
  }
  LLVM_DEBUG({
    for (Value *Output : Outputs)
      LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
  });
  assert(Outputs.empty() &&
         "OpenMP outlining should not produce live-out values!");

  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
  LLVM_DEBUG({
    for (auto *BB : Blocks)
      dbgs() << " PBR: " << BB->getName() << "\n";
  });

  // Adjust the finalization stack, verify the adjustment, and call the
  // finalize function one last time to finalize values between the pre-fini
  // block and the exit block if we left the parallel region "the normal way".
  auto FiniInfo = FinalizationStack.pop_back_val();
  (void)FiniInfo;
  assert(FiniInfo.DK == OMPD_parallel &&
         "Unexpected finalization stack state!");

  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();

  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
  if (Error Err = FiniCB(PreFiniIP))
    return Err;

  // Register the outlined info.
1690 addOutlineInfo(std::move(OI));
1691
1692 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1693 UI->eraseFromParent();
1694
1695 return AfterIP;
1696}
1697
1698void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1699 // Build call void __kmpc_flush(ident_t *loc)
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1703
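// For illustration, the call built below looks like this in IR (a sketch;
// the ident global name is whatever getOrCreateIdent produced):
// \code{c}
//   call void @__kmpc_flush(ptr @loc)
// \endcode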
1704 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1705}
1706
1707void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1708 if (!updateToLocation(Loc))
1709 return;
1710 emitFlush(Loc);
1711}
1712
1713void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1714 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1715 // global_tid);
1716 uint32_t SrcLocStrSize;
1717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1719 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1720
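// For illustration, the call built below looks like (a sketch; value names
// hypothetical):
// \code{c}
//   %ret = call i32 @__kmpc_omp_taskwait(ptr @loc, i32 %gtid)
// \endcode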
1721 // Ignore return result until untied tasks are supported.
1722 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1723 Args);
1724}
1725
1726void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1727 if (!updateToLocation(Loc))
1728 return;
1729 emitTaskwaitImpl(Loc);
1730}
1731
1732void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1733 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1734 uint32_t SrcLocStrSize;
1735 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1736 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1737 Constant *I32Null = ConstantInt::getNullValue(Int32);
1738 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1739
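// For illustration, the emitted call is (a sketch; value names hypothetical):
// \code{c}
//   call i32 @__kmpc_omp_taskyield(ptr @loc, i32 %gtid, i32 0)
// \endcode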
1740 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1741 Args);
1742}
1743
1744void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1745 if (!updateToLocation(Loc))
1746 return;
1747 emitTaskyieldImpl(Loc);
1748}
1749
1750// Processes the dependencies in Dependencies and does the following
1751// - Allocates space on the stack for an array of DependInfo objects
1752// - Populates each DependInfo object with the relevant information of
1753//   the corresponding dependence.
1754// - All code is inserted in the entry block of the current function.
1755static Value *emitTaskDependencies(
1756 OpenMPIRBuilder &OMPBuilder,
1757 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1758 // Early return if we have no dependencies to process
1759 if (Dependencies.empty())
1760 return nullptr;
1761
1762 // Given a vector of DependData objects, in this function we create an
1763 // array on the stack that holds kmp_depend_info objects corresponding
1764 // to each dependency. This is then passed to the OpenMP runtime.
1765 // For example, if there are 'n' dependencies then the following pseudo
1766 // code is generated. Assume the first dependence is on a variable 'a'.
1767 //
1768 // \code{c}
1769 // DepArray = alloca(n * sizeof(kmp_depend_info));
1770 // idx = 0;
1771 // DepArray[idx].base_addr = ptrtoint(&a);
1772 // DepArray[idx].len = 8;
1773 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1774 // ++idx;
1775 // DepArray[idx].base_addr = ...;
1776 // \endcode
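//
// For reference, each array element written below mirrors the runtime's
// kmp_depend_info record; this is a sketch (the authoritative definition
// lives in the OpenMP runtime, with field indices from RTLDependInfoFields):
// \code{c}
// struct kmp_depend_info {
//   intptr_t base_addr;  // RTLDependInfoFields::BaseAddr
//   size_t len;          // RTLDependInfoFields::Len
//   unsigned char flags; // RTLDependInfoFields::Flags
// };
// \endcode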
1777
1778 IRBuilderBase &Builder = OMPBuilder.Builder;
1779 Type *DependInfo = OMPBuilder.DependInfo;
1780 Module &M = OMPBuilder.M;
1781
1782 Value *DepArray = nullptr;
1783 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1784 Builder.SetInsertPoint(
1785 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1786
1787 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1788 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1789
1790 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1791 Value *Base =
1792 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1793 // Store the pointer to the variable
1794 Value *Addr = Builder.CreateStructGEP(
1795 DependInfo, Base,
1796 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1797 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1798 Builder.CreateStore(DepValPtr, Addr);
1799 // Store the size of the variable
1800 Value *Size = Builder.CreateStructGEP(
1801 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1802 Builder.CreateStore(
1803 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1804 Size);
1805 // Store the dependency kind
1806 Value *Flags = Builder.CreateStructGEP(
1807 DependInfo, Base,
1808 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1809 Builder.CreateStore(
1810 ConstantInt::get(Builder.getInt8Ty(),
1811 static_cast<unsigned int>(Dep.DepKind)),
1812 Flags);
1813 }
1814 Builder.restoreIP(OldIP);
1815 return DepArray;
1816}
1817
1818OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1819 const LocationDescription &Loc, InsertPointTy AllocaIP,
1820 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1821 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle) {
1822
1823 if (!updateToLocation(Loc))
1824 return InsertPointTy();
1825
1826 uint32_t SrcLocStrSize;
1827 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1828 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1829 // The current basic block is split into four basic blocks. After outlining,
1830 // they will be mapped as follows:
1831 // ```
1832 // def current_fn() {
1833 // current_basic_block:
1834 // br label %task.exit
1835 // task.exit:
1836 // ; instructions after task
1837 // }
1838 // def outlined_fn() {
1839 // task.alloca:
1840 // br label %task.body
1841 // task.body:
1842 // ret void
1843 // }
1844 // ```
1845 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1846 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1847 BasicBlock *TaskAllocaBB =
1848 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1849
1850 InsertPointTy TaskAllocaIP =
1851 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1852 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1853 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1854 return Err;
1855
1856 OutlineInfo OI;
1857 OI.EntryBB = TaskAllocaBB;
1858 OI.OuterAllocaBB = AllocaIP.getBlock();
1859 OI.ExitBB = TaskExitBB;
1860
1861 // Add the thread ID argument.
1862 SmallVector<Instruction *, 4> ToBeDeleted;
1863 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1864 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1865
1866 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1867 Mergeable, EventHandle, TaskAllocaBB,
1868 ToBeDeleted](Function &OutlinedFn) mutable {
1869 // Replace the stale CI with the appropriate RTL function call.
1870 assert(OutlinedFn.getNumUses() == 1 &&
1871 "there must be a single user for the outlined function");
1872 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1873
1874 // HasShareds is true if any variables are captured in the outlined region,
1875 // false otherwise.
1876 bool HasShareds = StaleCI->arg_size() > 1;
1877 Builder.SetInsertPoint(StaleCI);
1878
1879 // Gather the arguments for emitting the runtime call for
1880 // @__kmpc_omp_task_alloc
1881 Function *TaskAllocFn =
1882 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1883
1884 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task
1885 // allocation call.
1886 Value *ThreadID = getOrCreateThreadID(Ident);
1887
1888 // Argument - `flags`
1889 // Task is tied iff (Flags & 1) == 1.
1890 // Task is untied iff (Flags & 1) == 0.
1891 // Task is final iff (Flags & 2) == 2.
1892 // Task is not final iff (Flags & 2) == 0.
1893 // Task is mergeable iff (Flags & 4) == 4.
1894 // Task is not mergeable iff (Flags & 4) == 0.
1895 // TODO: Handle the other flags.
1896 Value *Flags = Builder.getInt32(Tied);
1897 if (Final) {
1898 Value *FinalFlag =
1899 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1900 Flags = Builder.CreateOr(FinalFlag, Flags);
1901 }
1902
1903 if (Mergeable)
1904 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1905
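// For illustration: with Tied=true and both Final and Mergeable set, the
// runtime sees flags = 1 | 2 | 4 = 7. A sketch of the IR built above
// (value names hypothetical):
// \code{c}
//   %final.flag = select i1 %final, i32 2, i32 0
//   %flags.tmp = or i32 %final.flag, 1
//   %flags = or i32 4, %flags.tmp
// \endcode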
1906 // Argument - `sizeof_kmp_task_t` (TaskSize)
1907 // Tasksize refers to the size in bytes of kmp_task_t data structure
1908 // including private vars accessed in task.
1909 // TODO: add kmp_task_t_with_privates (privates)
1910 Value *TaskSize = Builder.getInt64(
1911 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1912
1913 // Argument - `sizeof_shareds` (SharedsSize)
1914 // SharedsSize refers to the shareds array size in the kmp_task_t data
1915 // structure.
1916 Value *SharedsSize = Builder.getInt64(0);
1917 if (HasShareds) {
1918 AllocaInst *ArgStructAlloca =
1919 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1920 assert(ArgStructAlloca &&
1921 "Unable to find the alloca instruction corresponding to arguments "
1922 "for extracted function");
1923 StructType *ArgStructType =
1924 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1925 assert(ArgStructType && "Unable to find struct type corresponding to "
1926 "arguments for extracted function");
1927 SharedsSize =
1928 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1929 }
1930 // Emit the @__kmpc_omp_task_alloc runtime call
1931 // The runtime call returns a pointer to an area where the task captured
1932 // variables must be copied before the task is run (TaskData)
1933 CallInst *TaskData = Builder.CreateCall(
1934 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1935 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1936 /*task_func=*/&OutlinedFn});
1937
1938 // Emit detach clause initialization.
1939 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1940 // task_descriptor);
1941 if (EventHandle) {
1942 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
1943 OMPRTL___kmpc_task_allow_completion_event);
1944 llvm::Value *EventVal =
1945 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1946 llvm::Value *EventHandleAddr =
1947 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
1948 Builder.getPtrTy(0));
1949 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1950 Builder.CreateStore(EventVal, EventHandleAddr);
1951 }
1952 // Copy the arguments for outlined function
1953 if (HasShareds) {
1954 Value *Shareds = StaleCI->getArgOperand(1);
1955 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1956 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1957 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1958 SharedsSize);
1959 }
1960
1961 Value *DepArray = nullptr;
1962 if (Dependencies.size()) {
1963 InsertPointTy OldIP = Builder.saveIP();
1964 Builder.SetInsertPoint(
1965 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1966
1967 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1968 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1969
1970 unsigned P = 0;
1971 for (const DependData &Dep : Dependencies) {
1972 Value *Base =
1973 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1974 // Store the pointer to the variable
1975 Value *Addr = Builder.CreateStructGEP(
1976 DependInfo, Base,
1977 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1978 Value *DepValPtr =
1979 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1980 Builder.CreateStore(DepValPtr, Addr);
1981 // Store the size of the variable
1982 Value *Size = Builder.CreateStructGEP(
1983 DependInfo, Base,
1984 static_cast<unsigned int>(RTLDependInfoFields::Len));
1985 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1986 Dep.DepValueType)),
1987 Size);
1988 // Store the dependency kind
1989 Value *Flags = Builder.CreateStructGEP(
1990 DependInfo, Base,
1991 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1992 Builder.CreateStore(
1993 ConstantInt::get(Builder.getInt8Ty(),
1994 static_cast<unsigned int>(Dep.DepKind)),
1995 Flags);
1996 ++P;
1997 }
1998
1999 Builder.restoreIP(OldIP);
2000 }
2001
2002 // In the presence of the `if` clause, the following IR is generated:
2003 // ...
2004 // %data = call @__kmpc_omp_task_alloc(...)
2005 // br i1 %if_condition, label %then, label %else
2006 // then:
2007 // call @__kmpc_omp_task(...)
2008 // br label %exit
2009 // else:
2010 // ;; Wait for resolution of dependencies, if any, before
2011 // ;; beginning the task
2012 // call @__kmpc_omp_wait_deps(...)
2013 // call @__kmpc_omp_task_begin_if0(...)
2014 // call @outlined_fn(...)
2015 // call @__kmpc_omp_task_complete_if0(...)
2016 // br label %exit
2017 // exit:
2018 // ...
2019 if (IfCondition) {
2020 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2021 // terminator.
2022 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2023 Instruction *IfTerminator =
2024 Builder.GetInsertPoint()->getParent()->getTerminator();
2025 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2026 Builder.SetInsertPoint(IfTerminator);
2027 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2028 &ElseTI);
2029 Builder.SetInsertPoint(ElseTI);
2030
2031 if (Dependencies.size()) {
2032 Function *TaskWaitFn =
2033 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2034 Builder.CreateCall(
2035 TaskWaitFn,
2036 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2037 ConstantInt::get(Builder.getInt32Ty(), 0),
2038 ConstantPointerNull::get(Builder.getPtrTy())});
2039 }
2040 Function *TaskBeginFn =
2041 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2042 Function *TaskCompleteFn =
2043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2044 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2045 CallInst *CI = nullptr;
2046 if (HasShareds)
2047 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2048 else
2049 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2050 CI->setDebugLoc(StaleCI->getDebugLoc());
2051 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2052 Builder.SetInsertPoint(ThenTI);
2053 }
2054
2055 if (Dependencies.size()) {
2056 Function *TaskFn =
2057 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2058 Builder.CreateCall(
2059 TaskFn,
2060 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2061 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2062 ConstantPointerNull::get(Builder.getPtrTy())});
2063
2064 } else {
2065 // Emit the @__kmpc_omp_task runtime call to spawn the task
2066 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2067 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2068 }
2069
2070 StaleCI->eraseFromParent();
2071
2072 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2073 if (HasShareds) {
2074 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2075 OutlinedFn.getArg(1)->replaceUsesWithIf(
2076 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2077 }
2078
2079 for (Instruction *I : llvm::reverse(ToBeDeleted))
2080 I->eraseFromParent();
2081 };
2082
2083 addOutlineInfo(std::move(OI));
2084 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2085
2086 return Builder.saveIP();
2087}
2088
2089OpenMPIRBuilder::InsertPointOrErrorTy
2090OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2091 InsertPointTy AllocaIP,
2092 BodyGenCallbackTy BodyGenCB) {
2093 if (!updateToLocation(Loc))
2094 return InsertPointTy();
2095
2096 uint32_t SrcLocStrSize;
2097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2099 Value *ThreadID = getOrCreateThreadID(Ident);
2100
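// The overall shape of the region emitted by this function is (a sketch):
// \code{c}
//   call void @__kmpc_taskgroup(ptr @loc, i32 %gtid)
//   ; ... body generated by BodyGenCB, possibly spawning tasks ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %gtid)
// \endcode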
2101 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2102 Function *TaskgroupFn =
2103 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2104 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2105
2106 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2107 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2108 return Err;
2109
2110 Builder.SetInsertPoint(TaskgroupExitBB);
2111 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2112 Function *EndTaskgroupFn =
2113 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2114 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2115
2116 return Builder.saveIP();
2117}
2118
2119OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2120 const LocationDescription &Loc, InsertPointTy AllocaIP,
2121 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2122 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2123 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2124
2125 if (!updateToLocation(Loc))
2126 return Loc.IP;
2127
2128 auto FiniCBWrapper = [&](InsertPointTy IP) {
2129 if (IP.getBlock()->end() != IP.getPoint())
2130 return FiniCB(IP);
2131 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2132 // will fail because that function requires the Finalization Basic Block to
2133 // have a terminator, which is already removed by EmitOMPRegionBody.
2134 // IP is currently at the cancellation block.
2135 // We need to backtrack to the condition block to fetch
2136 // the exit block and create a branch from the cancellation block
2137 // to the exit block.
2139 Builder.restoreIP(IP);
2140 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2141 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2142 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2143 Instruction *I = Builder.CreateBr(ExitBB);
2144 IP = InsertPointTy(I->getParent(), I->getIterator());
2145 return FiniCB(IP);
2146 };
2147
2148 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2149
2150 // Each section is emitted as a switch case
2151 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2152 // -> OMP.createSection() which generates the IR for each section
2153 // Iterate through all sections and emit a switch construct:
2154 // switch (IV) {
2155 // case 0:
2156 // <SectionStmt[0]>;
2157 // break;
2158 // ...
2159 // case <NumSection> - 1:
2160 // <SectionStmt[<NumSection> - 1]>;
2161 // break;
2162 // }
2163 // ...
2164 // section_loop.after:
2165 // <FiniCB>;
2166 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2167 Builder.restoreIP(CodeGenIP);
2168 BasicBlock *Continue =
2169 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2170 Function *CurFn = Continue->getParent();
2171 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2172
2173 unsigned CaseNumber = 0;
2174 for (auto SectionCB : SectionCBs) {
2175 BasicBlock *CaseBB = BasicBlock::Create(
2176 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2177 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2178 Builder.SetInsertPoint(CaseBB);
2179 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2180 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2181 CaseEndBr->getIterator()}))
2182 return Err;
2183 CaseNumber++;
2184 }
2185 // Remove the existing terminator from the body BB since there can be no
2186 // terminators after a switch/case.
2187 return Error::success();
2188 };
2189 // Loop body ends here
2190 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2191 Type *I32Ty = Type::getInt32Ty(M.getContext());
2192 Value *LB = ConstantInt::get(I32Ty, 0);
2193 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2194 Value *ST = ConstantInt::get(I32Ty, 1);
2195 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2196 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2197 if (!LoopInfo)
2198 return LoopInfo.takeError();
2199
2200 InsertPointOrErrorTy WsloopIP =
2201 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2202 if (!WsloopIP)
2203 return WsloopIP.takeError();
2204 InsertPointTy AfterIP = *WsloopIP;
2205
2206 // Apply the finalization callback in LoopAfterBB
2207 auto FiniInfo = FinalizationStack.pop_back_val();
2208 assert(FiniInfo.DK == OMPD_sections &&
2209 "Unexpected finalization stack state!");
2210 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2211 Builder.restoreIP(AfterIP);
2212 BasicBlock *FiniBB =
2213 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2214 if (Error Err = CB(Builder.saveIP()))
2215 return Err;
2216 AfterIP = {FiniBB, FiniBB->begin()};
2217 }
2218
2219 return AfterIP;
2220}
2221
2224 BodyGenCallbackTy BodyGenCB,
2225 FinalizeCallbackTy FiniCB) {
2226 if (!updateToLocation(Loc))
2227 return Loc.IP;
2228
2229 auto FiniCBWrapper = [&](InsertPointTy IP) {
2230 if (IP.getBlock()->end() != IP.getPoint())
2231 return FiniCB(IP);
2232 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2233 // will fail because that function requires the Finalization Basic Block to
2234 // have a terminator, which is already removed by EmitOMPRegionBody.
2235 // IP is currently at the cancellation block.
2236 // We need to backtrack to the condition block to fetch
2237 // the exit block and create a branch from the cancellation block
2238 // to the exit block.
2240 Builder.restoreIP(IP);
2241 auto *CaseBB = Loc.IP.getBlock();
2242 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2243 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2244 Instruction *I = Builder.CreateBr(ExitBB);
2245 IP = InsertPointTy(I->getParent(), I->getIterator());
2246 return FiniCB(IP);
2247 };
2248
2249 Directive OMPD = Directive::OMPD_sections;
2250 // Since we are using Finalization Callback here, HasFinalize
2251 // and IsCancellable have to be true
2252 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2253 /*Conditional*/ false, /*hasFinalize*/ true,
2254 /*IsCancellable*/ true);
2255}
2256
2257static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2258 BasicBlock::iterator IT(I);
2259 IT++;
2260 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2261}
2262
2263void OpenMPIRBuilder::emitUsed(StringRef Name,
2264 std::vector<WeakTrackingVH> &List) {
2265 if (List.empty())
2266 return;
2267
2268 // Convert List to what ConstantArray needs.
2269 SmallVector<Constant *, 8> UsedArray;
2270 UsedArray.resize(List.size());
2271 for (unsigned I = 0, E = List.size(); I != E; ++I)
2272 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2273 cast<Constant>(&*List[I]), Builder.getPtrTy());
2274
2275 if (UsedArray.empty())
2276 return;
2277 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2278
2279 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2280 ConstantArray::get(ATy, UsedArray), Name);
2281
2282 GV->setSection("llvm.metadata");
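// For example (a sketch; the global names @a and @b are hypothetical), a
// two-entry List emitted under Name "llvm.compiler.used" yields:
// \code{c}
// @llvm.compiler.used = appending global [2 x ptr] [ptr @a, ptr @b],
//                       section "llvm.metadata"
// \endcode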
2283}
2284
2285Value *OpenMPIRBuilder::getGPUThreadID() {
2286 return Builder.CreateCall(
2287 getOrCreateRuntimeFunction(M,
2288 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2289 {});
2290}
2291
2292Value *OpenMPIRBuilder::getGPUWarpSize() {
2293 return Builder.CreateCall(
2294 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2295}
2296
2297Value *OpenMPIRBuilder::getNVPTXWarpID() {
2298 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2299 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2300}
2301
2302Value *OpenMPIRBuilder::getNVPTXLaneID() {
2303 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2304 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2305 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
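// E.g., for a warp size of 32: LaneIDBits == 5 and LaneIDMask == 31, so the
// lane id is tid & 31 (and getNVPTXWarpID above computes tid >> 5).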
2306 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2307 "nvptx_lane_id");
2308}
2309
2310Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2311 Type *ToType) {
2312 Type *FromType = From->getType();
2313 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2314 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2315 assert(FromSize > 0 && "From size must be greater than zero");
2316 assert(ToSize > 0 && "To size must be greater than zero");
2317 if (FromType == ToType)
2318 return From;
2319 if (FromSize == ToSize)
2320 return Builder.CreateBitCast(From, ToType);
2321 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2322 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2323 InsertPointTy SaveIP = Builder.saveIP();
2324 Builder.restoreIP(AllocaIP);
2325 Value *CastItem = Builder.CreateAlloca(ToType);
2326 Builder.restoreIP(SaveIP);
2327
2328 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2329 CastItem, Builder.getPtrTy(0));
2330 Builder.CreateStore(From, ValCastItem);
2331 return Builder.CreateLoad(ToType, CastItem);
2332}
2333
2334Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2335 Value *Element,
2336 Type *ElementType,
2337 Value *Offset) {
2338 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2339 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2340
2341 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2342 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2343 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2344 Value *WarpSize =
2345 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2346 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2347 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2348 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2349 Value *WarpSizeCast =
2350 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
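// A sketch of the emitted call for an element of at most 4 bytes, assuming
// the device runtime's __kmpc_shuffle_int32(value, offset, warp_size)
// signature (value names hypothetical):
// \code{c}
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %ws)
// \endcode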
2351 Value *ShuffleCall =
2352 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2353 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2354}
2355
2356void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2357 Value *DstAddr, Type *ElemType,
2358 Value *Offset, Type *ReductionArrayTy) {
2359 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2360 // Create the loop over the big sized data.
2361 // ptr = (void*)Elem;
2362 // ptrEnd = (void*) Elem + 1;
2363 // Step = 8;
2364 // while (ptr + Step < ptrEnd)
2365 // shuffle((int64_t)*ptr);
2366 // Step = 4;
2367 // while (ptr + Step < ptrEnd)
2368 // shuffle((int32_t)*ptr);
2369 // ...
2370 Type *IndexTy = Builder.getIndexTy(
2371 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2372 Value *ElemPtr = DstAddr;
2373 Value *Ptr = SrcAddr;
2374 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2375 if (Size < IntSize)
2376 continue;
2377 Type *IntType = Builder.getIntNTy(IntSize * 8);
2378 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2379 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2380 Value *SrcAddrGEP =
2381 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2382 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2383 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2384
2385 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2386 if ((Size / IntSize) > 1) {
2387 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2388 SrcAddrGEP, Builder.getPtrTy());
2389 BasicBlock *PreCondBB =
2390 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2391 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2392 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2393 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2394 emitBlock(PreCondBB, CurFunc);
2395 PHINode *PhiSrc =
2396 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2397 PhiSrc->addIncoming(Ptr, CurrentBB);
2398 PHINode *PhiDest =
2399 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2400 PhiDest->addIncoming(ElemPtr, CurrentBB);
2401 Ptr = PhiSrc;
2402 ElemPtr = PhiDest;
2403 Value *PtrDiff = Builder.CreatePtrDiff(
2404 Builder.getInt8Ty(), PtrEnd,
2405 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2406 Builder.CreateCondBr(
2407 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2408 ExitBB);
2409 emitBlock(ThenBB, CurFunc);
2410 Value *Res = createRuntimeShuffleFunction(
2411 AllocaIP,
2412 Builder.CreateAlignedLoad(
2413 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2414 IntType, Offset);
2415 Builder.CreateAlignedStore(Res, ElemPtr,
2416 M.getDataLayout().getPrefTypeAlign(ElemType));
2417 Value *LocalPtr =
2418 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2419 Value *LocalElemPtr =
2420 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2421 PhiSrc->addIncoming(LocalPtr, ThenBB);
2422 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2423 emitBranch(PreCondBB);
2424 emitBlock(ExitBB, CurFunc);
2425 } else {
2426 Value *Res = createRuntimeShuffleFunction(
2427 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2428 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2429 Res->getType()->getScalarSizeInBits())
2430 Res = Builder.CreateTrunc(Res, ElemType);
2431 Builder.CreateStore(Res, ElemPtr);
2432 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2433 ElemPtr =
2434 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2435 }
2436 Size = Size % IntSize;
2437 }
2438}
2439
2440void OpenMPIRBuilder::emitReductionListCopy(
2441 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2442 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2443 CopyOptionsTy CopyOptions) {
2444 Type *IndexTy = Builder.getIndexTy(
2446 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2447
2448 // Iterates, element-by-element, through the source Reduce list and
2449 // makes a copy.
2450 for (auto En : enumerate(ReductionInfos)) {
2451 const ReductionInfo &RI = En.value();
2452 Value *SrcElementAddr = nullptr;
2453 Value *DestElementAddr = nullptr;
2454 Value *DestElementPtrAddr = nullptr;
2455 // Should we shuffle in an element from a remote lane?
2456 bool ShuffleInElement = false;
2457 // Set to true to update the pointer in the dest Reduce list to a
2458 // newly created element.
2459 bool UpdateDestListPtr = false;
2460
2461 // Step 1.1: Get the address for the src element in the Reduce list.
2462 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2463 ReductionArrayTy, SrcBase,
2464 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2465 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2466
2467 // Step 1.2: Create a temporary to store the element in the destination
2468 // Reduce list.
2469 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2470 ReductionArrayTy, DestBase,
2471 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2472 switch (Action) {
2473 case CopyAction::RemoteLaneToThread: {
2474 InsertPointTy CurIP = Builder.saveIP();
2475 Builder.restoreIP(AllocaIP);
2476 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2477 ".omp.reduction.element");
2478 DestAlloca->setAlignment(
2479 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2480 DestElementAddr = DestAlloca;
2481 DestElementAddr =
2482 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.restoreIP(CurIP);
2485 ShuffleInElement = true;
2486 UpdateDestListPtr = true;
2487 break;
2488 }
2489 case CopyAction::ThreadCopy: {
2490 DestElementAddr =
2491 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2492 break;
2493 }
2494 }
2495
2496 // Now that all active lanes have read the element in the
2497 // Reduce list, shuffle over the value from the remote lane.
2498 if (ShuffleInElement) {
2499 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2500 RemoteLaneOffset, ReductionArrayTy);
2501 } else {
2502 switch (RI.EvaluationKind) {
2503 case EvalKind::Scalar: {
2504 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2505 // Store the source element value to the dest element address.
2506 Builder.CreateStore(Elem, DestElementAddr);
2507 break;
2508 }
2509 case EvalKind::Complex: {
2510 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2511 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2512 Value *SrcReal = Builder.CreateLoad(
2513 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2514 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2515 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2516 Value *SrcImg = Builder.CreateLoad(
2517 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2518
2519 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2520 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2521 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2522 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2523 Builder.CreateStore(SrcReal, DestRealPtr);
2524 Builder.CreateStore(SrcImg, DestImgPtr);
2525 break;
2526 }
2527 case EvalKind::Aggregate: {
2528 Value *SizeVal = Builder.getInt64(
2529 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2530 Builder.CreateMemCpy(
2531 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2532 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2533 SizeVal, false);
2534 break;
2535 }
2536 };
2537 }
2538
2539 // Step 3.1: Modify reference in dest Reduce list as needed.
2540 // Modifying the reference in Reduce list to point to the newly
2541 // created element. The element is live in the current function
2542 // scope and that of functions it invokes (i.e., reduce_function).
2543 // RemoteReduceData[i] = (void*)&RemoteElem
2544 if (UpdateDestListPtr) {
2545 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2546 DestElementAddr, Builder.getPtrTy(),
2547 DestElementAddr->getName() + ".ascast");
2548 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2549 }
2550 }
2551}
2552
2553Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2554 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2555 AttributeList FuncAttrs) {
2556 InsertPointTy SavedIP = Builder.saveIP();
2557 LLVMContext &Ctx = M.getContext();
2558 FunctionType *FuncTy = FunctionType::get(
2559 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2560 /* IsVarArg */ false);
2561 Function *WcFunc =
2562 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2563 "_omp_reduction_inter_warp_copy_func", &M);
2564 WcFunc->setAttributes(FuncAttrs);
2565 WcFunc->addParamAttr(0, Attribute::NoUndef);
2566 WcFunc->addParamAttr(1, Attribute::NoUndef);
2567 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2568 Builder.SetInsertPoint(EntryBB);
2569
2570 // ReduceList: thread local Reduce list.
2571 // At the stage of the computation when this function is called, partially
2572 // aggregated values reside in the first lane of every active warp.
2573 Argument *ReduceListArg = WcFunc->getArg(0);
2574 // NumWarps: number of warps active in the parallel region. This could
2575 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2576 Argument *NumWarpsArg = WcFunc->getArg(1);
2577
2578 // This array is used as a medium to transfer, one reduce element at a time,
2579 // the data from the first lane of every warp to lanes in the first warp
2580 // in order to perform the final step of a reduction in a parallel region
2581 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2582 // for reduced latency, as well as to have a distinct copy for concurrently
2583 // executing target regions. The array is declared with weak linkage so
2584 // as to be shared across compilation units.
2585 StringRef TransferMediumName =
2586 "__openmp_nvptx_data_transfer_temporary_storage";
2587 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2588 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2589 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2590 if (!TransferMedium) {
2591 TransferMedium = new GlobalVariable(
2592 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2593 UndefValue::get(ArrayTy), TransferMediumName,
2594 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2595 /*AddressSpace=*/3);
2596 }
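// In IR terms, the declaration created above is, for a warp size of 32
// (a sketch):
// \code{c}
// @__openmp_nvptx_data_transfer_temporary_storage =
//     weak addrspace(3) global [32 x i32] undef
// \endcode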
2597
2598 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2599 Value *GPUThreadID = getGPUThreadID();
2600 // nvptx_lane_id = nvptx_id % warpsize
2601 Value *LaneID = getNVPTXLaneID();
2602 // nvptx_warp_id = nvptx_id / warpsize
2603 Value *WarpID = getNVPTXWarpID();
2604
2605 InsertPointTy AllocaIP =
2606 InsertPointTy(EntryBB,
2607 EntryBB->getFirstInsertionPt());
2608 Type *Arg0Type = ReduceListArg->getType();
2609 Type *Arg1Type = NumWarpsArg->getType();
2610 Builder.restoreIP(AllocaIP);
2611 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2612 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2613 AllocaInst *NumWarpsAlloca =
2614 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2615 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2616 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2617 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2618 NumWarpsAlloca, Builder.getPtrTy(0),
2619 NumWarpsAlloca->getName() + ".ascast");
2620 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2621 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2622 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2623 InsertPointTy CodeGenIP =
2624 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2625 Builder.restoreIP(CodeGenIP);
2626
2627 Value *ReduceList =
2628 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2629
2630 for (auto En : enumerate(ReductionInfos)) {
2631 //
2632 // Warp master copies reduce element to transfer medium in __shared__
2633 // memory.
2634 //
2635 const ReductionInfo &RI = En.value();
2636 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2637 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2638 Type *CType = Builder.getIntNTy(TySize * 8);
2639
2640 unsigned NumIters = RealTySize / TySize;
2641 if (NumIters == 0)
2642 continue;
2643 Value *Cnt = nullptr;
2644 Value *CntAddr = nullptr;
2645 BasicBlock *PrecondBB = nullptr;
2646 BasicBlock *ExitBB = nullptr;
2647 if (NumIters > 1) {
2648 CodeGenIP = Builder.saveIP();
2649 Builder.restoreIP(AllocaIP);
2650 CntAddr =
2651 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2652
2653 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2654 CntAddr->getName() + ".ascast");
2655 Builder.restoreIP(CodeGenIP);
2656 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2657 CntAddr,
2658 /*Volatile=*/false);
2659 PrecondBB = BasicBlock::Create(Ctx, "precond");
2660 ExitBB = BasicBlock::Create(Ctx, "exit");
2661 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2662 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2663 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2664 /*Volatile=*/false);
2665 Value *Cmp = Builder.CreateICmpULT(
2666 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2667 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2668 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2669 }
2670
2671 // kmpc_barrier.
2672 InsertPointOrErrorTy BarrierIP1 =
2673 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2674 omp::Directive::OMPD_unknown,
2675 /* ForceSimpleCall */ false,
2676 /* CheckCancelFlag */ true);
2677 if (!BarrierIP1)
2678 return BarrierIP1.takeError();
2679 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2680 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2681 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2682
2683 // if (lane_id == 0)
2684 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2685 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2686 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2687
2688 // Reduce element = LocalReduceList[i]
2689 auto *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691 Type *IndexTy = Builder.getIndexTy(
2692 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2693 Value *ElemPtrPtr =
2694 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2695 {ConstantInt::get(IndexTy, 0),
2696 ConstantInt::get(IndexTy, En.index())});
2697 // elemptr = ((CopyType*)(elemptrptr)) + I
2698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2699 if (NumIters > 1)
2700 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2701
2702 // Get pointer to location in transfer medium.
2703 // MediumPtr = &medium[warp_id]
2704 Value *MediumPtr = Builder.CreateInBoundsGEP(
2705 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2706 // elem = *elemptr
2707 //*MediumPtr = elem
2708 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2709 // Store the source element value to the dest element address.
2710 Builder.CreateStore(Elem, MediumPtr,
2711 /*IsVolatile*/ true);
2712 Builder.CreateBr(MergeBB);
2713
2714 // else
2715 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2716 Builder.CreateBr(MergeBB);
2717
2718 // endif
2719 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2720 InsertPointOrErrorTy BarrierIP2 =
2721 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2722 omp::Directive::OMPD_unknown,
2723 /* ForceSimpleCall */ false,
2724 /* CheckCancelFlag */ true);
2725 if (!BarrierIP2)
2726 return BarrierIP2.takeError();
2727
2728 // Warp 0 copies reduce element from transfer medium
2729 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2730 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2731 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2732
2733 Value *NumWarpsVal =
2734 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2735 // Up to 32 threads in warp 0 are active.
2736 Value *IsActiveThread =
2737 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2738 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2739
2740 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2741
2742 // SrcMediumPtr = &medium[tid]
2743 // SrcMediumVal = *SrcMediumPtr
2744 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2745 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2746 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2747 Value *TargetElemPtrPtr =
2748 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2749 {ConstantInt::get(IndexTy, 0),
2750 ConstantInt::get(IndexTy, En.index())});
2751 Value *TargetElemPtrVal =
2752 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2753 Value *TargetElemPtr = TargetElemPtrVal;
2754 if (NumIters > 1)
2755 TargetElemPtr =
2756 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2757
2758 // *TargetElemPtr = SrcMediumVal;
2759 Value *SrcMediumValue =
2760 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2761 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2762 Builder.CreateBr(W0MergeBB);
2763
2764 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2765 Builder.CreateBr(W0MergeBB);
2766
2767 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2768
2769 if (NumIters > 1) {
2770 Cnt = Builder.CreateNSWAdd(
2771 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2772 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2773
2774 auto *CurFn = Builder.GetInsertBlock()->getParent();
2775 emitBranch(PrecondBB);
2776 emitBlock(ExitBB, CurFn);
2777 }
2778 RealTySize %= TySize;
2779 }
2780 }
2781
2782 Builder.CreateRetVoid();
2783 Builder.restoreIP(SavedIP);
2784
2785 return WcFunc;
2786}
2787
2788Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2789 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2790 AttributeList FuncAttrs) {
2791 LLVMContext &Ctx = M.getContext();
2792 FunctionType *FuncTy =
2793 FunctionType::get(Builder.getVoidTy(),
2794 {Builder.getPtrTy(), Builder.getInt16Ty(),
2795 Builder.getInt16Ty(), Builder.getInt16Ty()},
2796 /* IsVarArg */ false);
2797 Function *SarFunc =
2798 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2799 "_omp_reduction_shuffle_and_reduce_func", &M);
2800 SarFunc->setAttributes(FuncAttrs);
2801 SarFunc->addParamAttr(0, Attribute::NoUndef);
2802 SarFunc->addParamAttr(1, Attribute::NoUndef);
2803 SarFunc->addParamAttr(2, Attribute::NoUndef);
2804 SarFunc->addParamAttr(3, Attribute::NoUndef);
2805 SarFunc->addParamAttr(1, Attribute::SExt);
2806 SarFunc->addParamAttr(2, Attribute::SExt);
2807 SarFunc->addParamAttr(3, Attribute::SExt);
2808 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2809 Builder.SetInsertPoint(EntryBB);
2810
2811 // Thread local Reduce list used to host the values of data to be reduced.
2812 Argument *ReduceListArg = SarFunc->getArg(0);
2813 // Current lane id; could be logical.
2814 Argument *LaneIDArg = SarFunc->getArg(1);
2815 // Offset of the remote source lane relative to the current lane.
2816 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2817 // Algorithm version. This is expected to be known at compile time.
2818 Argument *AlgoVerArg = SarFunc->getArg(3);
2819
2820 Type *ReduceListArgType = ReduceListArg->getType();
2821 Type *LaneIDArgType = LaneIDArg->getType();
2822 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2823 Value *ReduceListAlloca = Builder.CreateAlloca(
2824 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2825 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2826 LaneIDArg->getName() + ".addr");
2827 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2828 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2829 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2830 AlgoVerArg->getName() + ".addr");
2831 ArrayType *RedListArrayTy =
2832 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2833
2834 // Create a local thread-private variable to host the Reduce list
2835 // from a remote lane.
2836 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2837 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2838
2839 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2840 ReduceListAlloca, ReduceListArgType,
2841 ReduceListAlloca->getName() + ".ascast");
2842 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2843 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2844 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2846 RemoteLaneOffsetAlloca->getName() + ".ascast");
2847 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2848 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2849 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2850 RemoteReductionListAlloca, Builder.getPtrTy(),
2851 RemoteReductionListAlloca->getName() + ".ascast");
2852
2853 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2854 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2855 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2856 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2857
2858 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2859 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2860 Value *RemoteLaneOffset =
2861 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2862 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2863
2864 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2865
2866 // This loop iterates through the list of reduce elements and copies,
2867 // element by element, from a remote lane in the warp to RemoteReduceList,
2868 // hosted on the thread's stack.
2869 emitReductionListCopy(
2870 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2871 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2872
2873 // The actions to be performed on the Remote Reduce list depend on
2874 // the algorithm version.
2875 //
2876 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2877 // LaneId % 2 == 0 && Offset > 0):
2878 // do the reduction value aggregation
2879 //
2880 // The thread local variable Reduce list is mutated in place to host the
2881 // reduced data, which is the aggregated value produced from local and
2882 // remote lanes.
2883 //
2884 // Note that AlgoVer is expected to be a constant integer known at compile
2885 // time.
2886 // When AlgoVer==0, the first conjunction evaluates to true, making
2887 // the entire predicate true at compile time.
2888 // When AlgoVer==1, only the second part of the second conjunction needs
2889 // to be evaluated at runtime; the other conjunctions evaluate to false
2890 // at compile time.
2891 // When AlgoVer==2, only the second part of the third conjunction needs
2892 // to be evaluated at runtime; the other conjunctions evaluate to false
2893 // at compile time.
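// For example, with AlgoVer==1 and Offset==16 in a 32-lane warp, lanes
// 0-15 take the reduction branch below, while lanes 16-31 fall through to
// the copy path guarded by CondCopy further down.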
2894 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2895 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2896 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2897 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2898 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2899 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2900 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2901 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2902 Value *RemoteOffsetComp =
2903 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2904 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2905 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2906 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2907
2908 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2909 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2910 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2911
2912 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2913 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2914 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2915 ReduceList, Builder.getPtrTy());
2916 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 RemoteListAddrCast, Builder.getPtrTy());
2918 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2919 ->addFnAttr(Attribute::NoUnwind);
2920 Builder.CreateBr(MergeBB);
2921
2922 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2923 Builder.CreateBr(MergeBB);
2924
2925 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2926
2927 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2928 // Reduce list.
2929 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2930 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2931 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2932
2933 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2934 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2935 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2936 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2937
2938 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2939 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2940 ReductionInfos, RemoteListAddrCast, ReduceList);
2941 Builder.CreateBr(CpyMergeBB);
2942
2943 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2944 Builder.CreateBr(CpyMergeBB);
2945
2946 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2947
2948 Builder.CreateRetVoid();
2949
2950 return SarFunc;
2951}
2952
2953Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2954 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2955 AttributeList FuncAttrs) {
2956 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2957 LLVMContext &Ctx = M.getContext();
2958 FunctionType *FuncTy = FunctionType::get(
2959 Builder.getVoidTy(),
2960 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2961 /* IsVarArg */ false);
2962 Function *LtGCFunc =
2963 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2964 "_omp_reduction_list_to_global_copy_func", &M);
2965 LtGCFunc->setAttributes(FuncAttrs);
2966 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2967 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2968 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2969
2970 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2971 Builder.SetInsertPoint(EntryBlock);
2972
2973 // Buffer: global reduction buffer.
2974 Argument *BufferArg = LtGCFunc->getArg(0);
2975 // Idx: index of the buffer.
2976 Argument *IdxArg = LtGCFunc->getArg(1);
2977 // ReduceList: thread local Reduce list.
2978 Argument *ReduceListArg = LtGCFunc->getArg(2);
2979
2980 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2981 BufferArg->getName() + ".addr");
2982 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2983 IdxArg->getName() + ".addr");
2984 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2985 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2986 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2987 BufferArgAlloca, Builder.getPtrTy(),
2988 BufferArgAlloca->getName() + ".ascast");
2989 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2990 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2991 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2992 ReduceListArgAlloca, Builder.getPtrTy(),
2993 ReduceListArgAlloca->getName() + ".ascast");
2994
2995 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2996 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2997 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2998
2999 Value *LocalReduceList =
3000 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3001 Value *BufferArgVal =
3002 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3003 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3004 Type *IndexTy = Builder.getIndexTy(
3005 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3006 for (auto En : enumerate(ReductionInfos)) {
3007 const ReductionInfo &RI = En.value();
3008 auto *RedListArrayTy =
3009 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3010 // Reduce element = LocalReduceList[i]
3011 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3012 RedListArrayTy, LocalReduceList,
3013 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3014 // elemptr = ((CopyType*)(elemptrptr)) + I
3015 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3016
3017 // Global = Buffer.VD[Idx];
3018 Value *BufferVD =
3019 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3020 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3021 ReductionsBufferTy, BufferVD, 0, En.index());
3022
3023 switch (RI.EvaluationKind) {
3024 case EvalKind::Scalar: {
3025 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3026 Builder.CreateStore(TargetElement, GlobVal);
3027 break;
3028 }
3029 case EvalKind::Complex: {
3030 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3031 RI.ElementType, ElemPtr, 0, 0, ".realp");
3032 Value *SrcReal = Builder.CreateLoad(
3033 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3034 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3035 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3036 Value *SrcImg = Builder.CreateLoad(
3037 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3038
3039 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3040 RI.ElementType, GlobVal, 0, 0, ".realp");
3041 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3042 RI.ElementType, GlobVal, 0, 1, ".imagp");
3043 Builder.CreateStore(SrcReal, DestRealPtr);
3044 Builder.CreateStore(SrcImg, DestImgPtr);
3045 break;
3046 }
3047 case EvalKind::Aggregate: {
3048 Value *SizeVal =
3049 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3050 Builder.CreateMemCpy(
3051 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3052 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3053 break;
3054 }
3055 }
3056 }
3057
3058 Builder.CreateRetVoid();
3059 Builder.restoreIP(OldIP);
3060 return LtGCFunc;
3061}
3062
3063Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3064 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3065 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3066 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3067 LLVMContext &Ctx = M.getContext();
3068 FunctionType *FuncTy = FunctionType::get(
3069 Builder.getVoidTy(),
3070 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3071 /* IsVarArg */ false);
3072 Function *LtGRFunc =
3073 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3074 "_omp_reduction_list_to_global_reduce_func", &M);
3075 LtGRFunc->setAttributes(FuncAttrs);
3076 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3077 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3078 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3079
3080 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3081 Builder.SetInsertPoint(EntryBlock);
3082
3083 // Buffer: global reduction buffer.
3084 Argument *BufferArg = LtGRFunc->getArg(0);
3085 // Idx: index of the buffer.
3086 Argument *IdxArg = LtGRFunc->getArg(1);
3087 // ReduceList: thread local Reduce list.
3088 Argument *ReduceListArg = LtGRFunc->getArg(2);
3089
3090 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3091 BufferArg->getName() + ".addr");
3092 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3093 IdxArg->getName() + ".addr");
3094 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3095 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3096 auto *RedListArrayTy =
3097 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3098
3099 // 1. Build a list of reduction variables.
3100 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3101 Value *LocalReduceList =
3102 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3103
3104 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3105 BufferArgAlloca, Builder.getPtrTy(),
3106 BufferArgAlloca->getName() + ".ascast");
3107 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3108 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3109 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 ReduceListArgAlloca, Builder.getPtrTy(),
3111 ReduceListArgAlloca->getName() + ".ascast");
3112 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 LocalReduceList, Builder.getPtrTy(),
3114 LocalReduceList->getName() + ".ascast");
3115
3116 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3117 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3118 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3119
3120 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3121 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3122 Type *IndexTy = Builder.getIndexTy(
3123 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3124 for (auto En : enumerate(ReductionInfos)) {
3125 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3126 RedListArrayTy, LocalReduceListAddrCast,
3127 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3128 Value *BufferVD =
3129 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3130 // Global = Buffer.VD[Idx];
3131 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3132 ReductionsBufferTy, BufferVD, 0, En.index());
3133 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3134 }
3135
3136 // Call reduce_function(GlobalReduceList, ReduceList)
3137 Value *ReduceList =
3138 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3139 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3140 ->addFnAttr(Attribute::NoUnwind);
3141 Builder.CreateRetVoid();
3142 Builder.restoreIP(OldIP);
3143 return LtGRFunc;
3144}
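// Roughly, the reducer emitted above gathers pointers to the globalized
// fields and delegates to ReduceFn (a hedged sketch; names illustrative):
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global[n];
//     // one unrolled iteration per ReductionInfo
//     global[i] = &((struct _globalized_locals_ty *)buffer)[idx].elem_i;
//     reduce_function(global, reduce_list);
//   }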
3145
3146Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3147 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3148 AttributeList FuncAttrs) {
3149 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3150 LLVMContext &Ctx = M.getContext();
3151 auto *FuncTy = FunctionType::get(
3152 Builder.getVoidTy(),
3153 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3154 /* IsVarArg */ false);
3155 Function *LtGCFunc =
3156 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3157 "_omp_reduction_global_to_list_copy_func", &M);
3158 LtGCFunc->setAttributes(FuncAttrs);
3159 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3160 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3161 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3162
3163 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3164 Builder.SetInsertPoint(EntryBlock);
3165
3166 // Buffer: global reduction buffer.
3167 Argument *BufferArg = LtGCFunc->getArg(0);
3168 // Idx: index of the buffer.
3169 Argument *IdxArg = LtGCFunc->getArg(1);
3170 // ReduceList: thread local Reduce list.
3171 Argument *ReduceListArg = LtGCFunc->getArg(2);
3172
3173 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3174 BufferArg->getName() + ".addr");
3175 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3176 IdxArg->getName() + ".addr");
3177 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3178 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3179 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3180 BufferArgAlloca, Builder.getPtrTy(),
3181 BufferArgAlloca->getName() + ".ascast");
3182 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3183 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3184 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3185 ReduceListArgAlloca, Builder.getPtrTy(),
3186 ReduceListArgAlloca->getName() + ".ascast");
3187 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3188 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3189 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3190
3191 Value *LocalReduceList =
3192 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3193 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3194 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3195 Type *IndexTy = Builder.getIndexTy(
3196 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3197 for (auto En : enumerate(ReductionInfos)) {
3198 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3199 auto *RedListArrayTy =
3200 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3201 // Reduce element = LocalReduceList[i]
3202 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3203 RedListArrayTy, LocalReduceList,
3204 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3205 // elemptr = ((CopyType*)(elemptrptr)) + I
3206 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3207 // Global = Buffer.VD[Idx];
3208 Value *BufferVD =
3209 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3210 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3211 ReductionsBufferTy, BufferVD, 0, En.index());
3212
3213 switch (RI.EvaluationKind) {
3214 case EvalKind::Scalar: {
3215 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3216 Builder.CreateStore(TargetElement, ElemPtr);
3217 break;
3218 }
3219 case EvalKind::Complex: {
3220 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3221 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3222 Value *SrcReal = Builder.CreateLoad(
3223 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3224 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3225 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3226 Value *SrcImg = Builder.CreateLoad(
3227 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3228
3229 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3230 RI.ElementType, ElemPtr, 0, 0, ".realp");
3231 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3232 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3233 Builder.CreateStore(SrcReal, DestRealPtr);
3234 Builder.CreateStore(SrcImg, DestImgPtr);
3235 break;
3236 }
3237 case EvalKind::Aggregate: {
3238 Value *SizeVal =
3239 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3240 Builder.CreateMemCpy(
3241 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3242 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3243 SizeVal, false);
3244 break;
3245 }
3246 }
3247 }
3248
3249 Builder.CreateRetVoid();
3250 Builder.restoreIP(OldIP);
3251 return LtGCFunc;
3252}
3253
3254Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3255 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3256 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3257 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3258 LLVMContext &Ctx = M.getContext();
3259 auto *FuncTy = FunctionType::get(
3260 Builder.getVoidTy(),
3261 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3262 /* IsVarArg */ false);
3263 Function *LtGRFunc =
3264 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3265 "_omp_reduction_global_to_list_reduce_func", &M);
3266 LtGRFunc->setAttributes(FuncAttrs);
3267 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3268 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3269 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3270
3271 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3272 Builder.SetInsertPoint(EntryBlock);
3273
3274 // Buffer: global reduction buffer.
3275 Argument *BufferArg = LtGRFunc->getArg(0);
3276 // Idx: index of the buffer.
3277 Argument *IdxArg = LtGRFunc->getArg(1);
3278 // ReduceList: thread local Reduce list.
3279 Argument *ReduceListArg = LtGRFunc->getArg(2);
3280
3281 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3282 BufferArg->getName() + ".addr");
3283 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3284 IdxArg->getName() + ".addr");
3285 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3286 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3287 ArrayType *RedListArrayTy =
3288 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3289
3290 // 1. Build a list of reduction variables.
3291 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3292 Value *LocalReduceList =
3293 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3294
3295 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3296 BufferArgAlloca, Builder.getPtrTy(),
3297 BufferArgAlloca->getName() + ".ascast");
3298 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3299 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3300 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3301 ReduceListArgAlloca, Builder.getPtrTy(),
3302 ReduceListArgAlloca->getName() + ".ascast");
3303 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3304 LocalReduceList, Builder.getPtrTy(),
3305 LocalReduceList->getName() + ".ascast");
3306
3307 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3308 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3309 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3310
3311 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3312 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3313 Type *IndexTy = Builder.getIndexTy(
3314 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3315 for (auto En : enumerate(ReductionInfos)) {
3316 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3317 RedListArrayTy, ReductionList,
3318 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3319 // Global = Buffer.VD[Idx];
3320 Value *BufferVD =
3321 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3322 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3323 ReductionsBufferTy, BufferVD, 0, En.index());
3324 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3325 }
3326
3327 // Call reduce_function(ReduceList, GlobalReduceList)
3328 Value *ReduceList =
3329 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3330 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3331 ->addFnAttr(Attribute::NoUnwind);
3332 Builder.CreateRetVoid();
3333 Builder.restoreIP(OldIP);
3334 return LtGRFunc;
3335}
3336
3337std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3338 std::string Suffix =
3339 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3340 return (Name + Suffix).str();
3341}
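// For example, with the default host separators this yields a name like
// "<ReducerName>.omp.reduction.reduction_func"; target devices use the
// configured separators instead (see createPlatformSpecificName), so the
// exact spelling is configuration-dependent.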
3342
3343Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3344 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3345 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3346 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3347 {Builder.getPtrTy(), Builder.getPtrTy()},
3348 /* IsVarArg */ false);
3349 std::string Name = getReductionFuncName(ReducerName);
3350 Function *ReductionFunc =
3351 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3352 ReductionFunc->setAttributes(FuncAttrs);
3353 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3354 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3355 BasicBlock *EntryBB =
3356 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3357 Builder.SetInsertPoint(EntryBB);
3358
3359 // Need to alloca memory here and deal with the pointers before getting
3360 // LHS/RHS pointers out
3361 Value *LHSArrayPtr = nullptr;
3362 Value *RHSArrayPtr = nullptr;
3363 Argument *Arg0 = ReductionFunc->getArg(0);
3364 Argument *Arg1 = ReductionFunc->getArg(1);
3365 Type *Arg0Type = Arg0->getType();
3366 Type *Arg1Type = Arg1->getType();
3367
3368 Value *LHSAlloca =
3369 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3370 Value *RHSAlloca =
3371 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3372 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3373 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3374 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3375 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3376 Builder.CreateStore(Arg0, LHSAddrCast);
3377 Builder.CreateStore(Arg1, RHSAddrCast);
3378 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3379 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3380
3381 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3382 Type *IndexTy = Builder.getIndexTy(
3383 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3384 SmallVector<Value *> LHSPtrs, RHSPtrs;
3385 for (auto En : enumerate(ReductionInfos)) {
3386 const ReductionInfo &RI = En.value();
3387 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3388 RedArrayTy, RHSArrayPtr,
3389 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3390 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3391 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3392 RHSI8Ptr, RI.PrivateVariable->getType(),
3393 RHSI8Ptr->getName() + ".ascast");
3394
3395 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3396 RedArrayTy, LHSArrayPtr,
3397 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3398 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3399 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3400 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3401
3402 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3403 LHSPtrs.emplace_back(LHSPtr);
3404 RHSPtrs.emplace_back(RHSPtr);
3405 } else {
3406 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3407 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3408 Value *Reduced;
3409 InsertPointOrErrorTy AfterIP =
3410 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3411 if (!AfterIP)
3412 return AfterIP.takeError();
3413 if (!Builder.GetInsertBlock())
3414 return ReductionFunc;
3415 Builder.CreateStore(Reduced, LHSPtr);
3416 }
3417 }
3418
3419 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3420 for (auto En : enumerate(ReductionInfos)) {
3421 unsigned Index = En.index();
3422 const ReductionInfo &RI = En.value();
3423 Value *LHSFixupPtr, *RHSFixupPtr;
3424 Builder.restoreIP(RI.ReductionGenClang(
3425 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3426
3427 // Fix the callback code generated to use the correct Values for the LHS
3428 // and RHS
3429 LHSFixupPtr->replaceUsesWithIf(
3430 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3431 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3432 ReductionFunc;
3433 });
3434 RHSFixupPtr->replaceUsesWithIf(
3435 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3436 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3437 ReductionFunc;
3438 });
3439 }
3440
3441 Builder.CreateRetVoid();
3442 return ReductionFunc;
3443}
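// In essence, for the non-Clang callback path the emitted function is
// (a sketch, with ElemType standing in for each RI.ElementType):
//   void reduction_func(void **lhs, void **rhs) {
//     // one unrolled iteration per ReductionInfo
//     *(ElemType *)lhs[i] = reduce(*(ElemType *)lhs[i], *(ElemType *)rhs[i]);
//   }
// For the Clang path, the callback emits the body and the fix-up loop above
// rewires its placeholder pointers to the freshly loaded element pointers.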
3444
3445static void
3446 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3447 bool IsGPU) {
3448 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3449 (void)RI;
3450 assert(RI.Variable && "expected non-null variable");
3451 assert(RI.PrivateVariable && "expected non-null private variable");
3452 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3453 "expected non-null reduction generator callback");
3454 if (!IsGPU) {
3455 assert(
3456 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3457 "expected variables and their private equivalents to have the same "
3458 "type");
3459 }
3460 assert(RI.Variable->getType()->isPointerTy() &&
3461 "expected variables to be pointers");
3462 }
3463}
3464
3466 const LocationDescription &Loc, InsertPointTy AllocaIP,
3467 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3468 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3469 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3470 unsigned ReductionBufNum, Value *SrcLocInfo) {
3471 if (!updateToLocation(Loc))
3472 return InsertPointTy();
3473 Builder.restoreIP(CodeGenIP);
3474 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3475 LLVMContext &Ctx = M.getContext();
3476
3477 // Source location for the ident struct
3478 if (!SrcLocInfo) {
3479 uint32_t SrcLocStrSize;
3480 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3481 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3482 }
3483
3484 if (ReductionInfos.size() == 0)
3485 return Builder.saveIP();
3486
3487 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3488 AttributeList FuncAttrs;
3489 AttrBuilder AttrBldr(Ctx);
3490 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3491 AttrBldr.addAttribute(Attr);
3492 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3493 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3494
3495 CodeGenIP = Builder.saveIP();
3496 Expected<Function *> ReductionResult =
3497 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3498 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3499 if (!ReductionResult)
3500 return ReductionResult.takeError();
3501 Function *ReductionFunc = *ReductionResult;
3502 Builder.restoreIP(CodeGenIP);
3503
3504 // Set the grid value in the config needed for lowering later on
3505 if (GridValue.has_value())
3506 Config.setGridValue(GridValue.value());
3507 else
3508 Config.setGridValue(getGridValue(T, ReductionFunc));
3509
3510 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3511 // RedList, shuffle_reduce_func, interwarp_copy_func);
3512 // or
3513 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3514 Value *Res;
3515
3516 // 1. Build a list of reduction variables.
3517 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3518 auto Size = ReductionInfos.size();
3519 Type *PtrTy = PointerType::getUnqual(Ctx);
3520 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3521 CodeGenIP = Builder.saveIP();
3522 Builder.restoreIP(AllocaIP);
3523 Value *ReductionListAlloca =
3524 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3525 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3526 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3527 Builder.restoreIP(CodeGenIP);
3528 Type *IndexTy = Builder.getIndexTy(
3529 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3530 for (auto En : enumerate(ReductionInfos)) {
3531 const ReductionInfo &RI = En.value();
3532 Value *ElemPtr = Builder.CreateInBoundsGEP(
3533 RedArrayTy, ReductionList,
3534 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3535 Value *CastElem =
3536 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3537 Builder.CreateStore(CastElem, ElemPtr);
3538 }
3539 CodeGenIP = Builder.saveIP();
3540 Function *SarFunc =
3541 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3542 Expected<Function *> CopyResult =
3543 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3544 if (!CopyResult)
3545 return CopyResult.takeError();
3546 Function *WcFunc = *CopyResult;
3547 Builder.restoreIP(CodeGenIP);
3548
3549 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3550
3551 unsigned MaxDataSize = 0;
3552 SmallVector<Type *> ReductionTypeArgs;
3553 for (auto En : enumerate(ReductionInfos)) {
3554 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3555 if (Size > MaxDataSize)
3556 MaxDataSize = Size;
3557 ReductionTypeArgs.emplace_back(En.value().ElementType);
3558 }
3559 Value *ReductionDataSize =
3560 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3561 if (!IsTeamsReduction) {
3562 Value *SarFuncCast =
3563 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3564 Value *WcFuncCast =
3565 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3566 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3567 WcFuncCast};
3568 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3569 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3570 Res = Builder.CreateCall(Pv2Ptr, Args);
3571 } else {
3572 CodeGenIP = Builder.saveIP();
3573 StructType *ReductionsBufferTy = StructType::create(
3574 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3575 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3576 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3577 Function *LtGCFunc = emitListToGlobalCopyFunction(
3578 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3579 Function *LtGRFunc = emitListToGlobalReduceFunction(
3580 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3581 Function *GtLCFunc = emitGlobalToListCopyFunction(
3582 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3583 Function *GtLRFunc = emitGlobalToListReduceFunction(
3584 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3585 Builder.restoreIP(CodeGenIP);
3586
3587 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3588 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3589
3590 Value *Args3[] = {SrcLocInfo,
3591 KernelTeamsReductionPtr,
3592 Builder.getInt32(ReductionBufNum),
3593 ReductionDataSize,
3594 RL,
3595 SarFunc,
3596 WcFunc,
3597 LtGCFunc,
3598 LtGRFunc,
3599 GtLCFunc,
3600 GtLRFunc};
3601
3602 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3603 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3604 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3605 }
3606
3607 // 5. Build if (res == 1)
3608 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3609 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3610 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3611 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3612
3613 // 6. Build then branch: where we have reduced values in the master
3614 // thread in each team.
3615 // __kmpc_end_reduce{_nowait}(<gtid>);
3616 // break;
3617 emitBlock(ThenBB, CurFunc);
3618
3619 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3620 for (auto En : enumerate(ReductionInfos)) {
3621 const ReductionInfo &RI = En.value();
3622 Value *LHS = RI.Variable;
3623 Value *RHS =
3624 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3625
3626 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3627 Value *LHSPtr, *RHSPtr;
3628 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3629 &LHSPtr, &RHSPtr, CurFunc));
3630
3631 // Fix the callback code generated to use the correct Values for the LHS
3632 // and RHS
3633 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3634 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3635 ReductionFunc;
3636 });
3637 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3638 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3639 ReductionFunc;
3640 });
3641 } else {
3642 assert(false && "Unhandled ReductionGenCBKind");
3643 }
3644 }
3645 emitBlock(ExitBB, CurFunc);
3646
3647 Config.setEmitLLVMUsed();
3648
3649 return Builder.saveIP();
3650}
3651
3652 static Function *getFreshReductionFunc(Module &M) {
3653 Type *VoidTy = Type::getVoidTy(M.getContext());
3654 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3655 auto *FuncTy =
3656 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3657 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3658 ".omp.reduction.func", &M);
3659}
3660
3661 OpenMPIRBuilder::InsertPointOrErrorTy
3662 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3663 InsertPointTy AllocaIP,
3664 ArrayRef<ReductionInfo> ReductionInfos,
3665 ArrayRef<bool> IsByRef, bool IsNoWait) {
3666 assert(ReductionInfos.size() == IsByRef.size());
3667 for (const ReductionInfo &RI : ReductionInfos) {
3668 (void)RI;
3669 assert(RI.Variable && "expected non-null variable");
3670 assert(RI.PrivateVariable && "expected non-null private variable");
3671 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3672 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3673 "expected variables and their private equivalents to have the same "
3674 "type");
3675 assert(RI.Variable->getType()->isPointerTy() &&
3676 "expected variables to be pointers");
3677 }
3678
3679 if (!updateToLocation(Loc))
3680 return InsertPointTy();
3681
3682 BasicBlock *InsertBlock = Loc.IP.getBlock();
3683 BasicBlock *ContinuationBlock =
3684 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3685 InsertBlock->getTerminator()->eraseFromParent();
3686
3687 // Create and populate array of type-erased pointers to private reduction
3688 // values.
3689 unsigned NumReductions = ReductionInfos.size();
3690 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3691 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3692 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3693
3694 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3695
3696 for (auto En : enumerate(ReductionInfos)) {
3697 unsigned Index = En.index();
3698 const ReductionInfo &RI = En.value();
3699 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3700 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3701 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3702 }
3703
3704 // Emit a call to the runtime function that orchestrates the reduction.
3705 // Declare the reduction function in the process.
3706 Function *Func = Builder.GetInsertBlock()->getParent();
3707 Module *Module = Func->getParent();
3708 uint32_t SrcLocStrSize;
3709 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3710 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3711 return RI.AtomicReductionGen;
3712 });
3713 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3714 CanGenerateAtomic
3715 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3716 : IdentFlag(0));
3717 Value *ThreadId = getOrCreateThreadID(Ident);
3718 Constant *NumVariables = Builder.getInt32(NumReductions);
3719 const DataLayout &DL = Module->getDataLayout();
3720 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3721 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3722 Function *ReductionFunc = getFreshReductionFunc(*Module);
3723 Value *Lock = getOMPCriticalRegionLock(".reduction");
3724 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3725 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3726 : RuntimeFunction::OMPRTL___kmpc_reduce);
3727 CallInst *ReduceCall =
3728 Builder.CreateCall(ReduceFunc,
3729 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3730 ReductionFunc, Lock},
3731 "reduce");
3732
3733 // Create final reduction entry blocks for the atomic and non-atomic case.
3734 // Emit IR that dispatches control flow to one of the blocks based on the
3735 // reduction supporting the atomic mode.
3736 BasicBlock *NonAtomicRedBlock =
3737 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3738 BasicBlock *AtomicRedBlock =
3739 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3740 SwitchInst *Switch =
3741 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3742 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3743 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
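// Per the libomp convention, __kmpc_reduce returns 1 when this thread should
// perform the non-atomic reduction, 2 when it should take the atomic
// fallback, and 0 when it has nothing left to do, which falls through to the
// default (continuation) destination of the switch.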
3744
3745 // Populate the non-atomic reduction using the elementwise reduction function.
3746 // This loads the elements from the global and private variables and reduces
3747 // them before storing back the result to the global variable.
3748 Builder.SetInsertPoint(NonAtomicRedBlock);
3749 for (auto En : enumerate(ReductionInfos)) {
3750 const ReductionInfo &RI = En.value();
3751 Type *ValueType = RI.ElementType;
3752 // We have one less load for the by-ref case because that load is now
3753 // inside the reduction region.
3754 Value *RedValue = RI.Variable;
3755 if (!IsByRef[En.index()]) {
3756 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3757 "red.value." + Twine(En.index()));
3758 }
3759 Value *PrivateRedValue =
3760 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3761 "red.private.value." + Twine(En.index()));
3762 Value *Reduced;
3763 InsertPointOrErrorTy AfterIP =
3764 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3765 if (!AfterIP)
3766 return AfterIP.takeError();
3767 Builder.restoreIP(*AfterIP);
3768
3769 if (!Builder.GetInsertBlock())
3770 return InsertPointTy();
3771 // for by-ref case, the load is inside of the reduction region
3772 if (!IsByRef[En.index()])
3773 Builder.CreateStore(Reduced, RI.Variable);
3774 }
3775 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3776 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3777 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3778 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3779 Builder.CreateBr(ContinuationBlock);
3780
3781 // Populate the atomic reduction using the atomic elementwise reduction
3782 // function. There are no loads/stores here because they will be happening
3783 // inside the atomic elementwise reduction.
3784 Builder.SetInsertPoint(AtomicRedBlock);
3785 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3786 for (const ReductionInfo &RI : ReductionInfos) {
3787 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3788 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3789 if (!AfterIP)
3790 return AfterIP.takeError();
3791 Builder.restoreIP(*AfterIP);
3792 if (!Builder.GetInsertBlock())
3793 return InsertPointTy();
3794 }
3795 Builder.CreateBr(ContinuationBlock);
3796 } else {
3797 Builder.CreateUnreachable();
3798 }
3799
3800 // Populate the outlined reduction function using the elementwise reduction
3801 // function. Partial values are extracted from the type-erased array of
3802 // pointers to private variables.
3803 BasicBlock *ReductionFuncBlock =
3804 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3805 Builder.SetInsertPoint(ReductionFuncBlock);
3806 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3807 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3808
3809 for (auto En : enumerate(ReductionInfos)) {
3810 const ReductionInfo &RI = En.value();
3811 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3812 RedArrayTy, LHSArrayPtr, 0, En.index());
3813 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3814 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3815 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3816 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3817 RedArrayTy, RHSArrayPtr, 0, En.index());
3818 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3819 Value *RHSPtr =
3820 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3821 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3822 Value *Reduced;
3823 InsertPointOrErrorTy AfterIP =
3824 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3825 if (!AfterIP)
3826 return AfterIP.takeError();
3827 Builder.restoreIP(*AfterIP);
3828 if (!Builder.GetInsertBlock())
3829 return InsertPointTy();
3830 // store is inside of the reduction region when using by-ref
3831 if (!IsByRef[En.index()])
3832 Builder.CreateStore(Reduced, LHSPtr);
3833 }
3834 Builder.CreateRetVoid();
3835
3836 Builder.SetInsertPoint(ContinuationBlock);
3837 return Builder.saveIP();
3838}
3839
3840 OpenMPIRBuilder::InsertPointOrErrorTy
3841 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3842 BodyGenCallbackTy BodyGenCB,
3843 FinalizeCallbackTy FiniCB) {
3844 if (!updateToLocation(Loc))
3845 return Loc.IP;
3846
3847 Directive OMPD = Directive::OMPD_master;
3848 uint32_t SrcLocStrSize;
3849 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3850 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3851 Value *ThreadId = getOrCreateThreadID(Ident);
3852 Value *Args[] = {Ident, ThreadId};
3853
3854 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3855 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3856
3857 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3858 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3859
3860 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3861 /*Conditional*/ true, /*hasFinalize*/ true);
3862}
3863
3864 OpenMPIRBuilder::InsertPointOrErrorTy
3865 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3866 BodyGenCallbackTy BodyGenCB,
3867 FinalizeCallbackTy FiniCB, Value *Filter) {
3868 if (!updateToLocation(Loc))
3869 return Loc.IP;
3870
3871 Directive OMPD = Directive::OMPD_masked;
3872 uint32_t SrcLocStrSize;
3873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3875 Value *ThreadId = getOrCreateThreadID(Ident);
3876 Value *Args[] = {Ident, ThreadId, Filter};
3877 Value *ArgsEnd[] = {Ident, ThreadId};
3878
3879 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3880 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3881
3882 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3883 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3884
3885 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3886 /*Conditional*/ true, /*hasFinalize*/ true);
3887}
3888
3889 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3890 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3891 BasicBlock *PostInsertBefore, const Twine &Name) {
3892 Module *M = F->getParent();
3893 LLVMContext &Ctx = M->getContext();
3894 Type *IndVarTy = TripCount->getType();
3895
3896 // Create the basic block structure.
3897 BasicBlock *Preheader =
3898 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3899 BasicBlock *Header =
3900 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3901 BasicBlock *Cond =
3902 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3903 BasicBlock *Body =
3904 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3905 BasicBlock *Latch =
3906 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3907 BasicBlock *Exit =
3908 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3909 BasicBlock *After =
3910 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3911
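// The skeleton assembled below has the canonical control-flow shape:
//   preheader -> header -> cond --true--> body -> inc -> header
//                          cond --false-> exit -> after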
3912 // Use specified DebugLoc for new instructions.
3913 Builder.SetCurrentDebugLocation(DL);
3914
3915 Builder.SetInsertPoint(Preheader);
3916 Builder.CreateBr(Header);
3917
3918 Builder.SetInsertPoint(Header);
3919 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3920 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3921 Builder.CreateBr(Cond);
3922
3923 Builder.SetInsertPoint(Cond);
3924 Value *Cmp =
3925 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3926 Builder.CreateCondBr(Cmp, Body, Exit);
3927
3928 Builder.SetInsertPoint(Body);
3929 Builder.CreateBr(Latch);
3930
3931 Builder.SetInsertPoint(Latch);
3932 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3933 "omp_" + Name + ".next", /*HasNUW=*/true);
3934 Builder.CreateBr(Header);
3935 IndVarPHI->addIncoming(Next, Latch);
3936
3937 Builder.SetInsertPoint(Exit);
3938 Builder.CreateBr(After);
3939
3940 // Remember and return the canonical control flow.
3941 LoopInfos.emplace_front();
3942 CanonicalLoopInfo *CL = &LoopInfos.front();
3943
3944 CL->Header = Header;
3945 CL->Cond = Cond;
3946 CL->Latch = Latch;
3947 CL->Exit = Exit;
3948
3949#ifndef NDEBUG
3950 CL->assertOK();
3951#endif
3952 return CL;
3953}
3954
3955 Expected<CanonicalLoopInfo *>
3956 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3957 LoopBodyGenCallbackTy BodyGenCB,
3958 Value *TripCount, const Twine &Name) {
3959 BasicBlock *BB = Loc.IP.getBlock();
3960 BasicBlock *NextBB = BB->getNextNode();
3961
3962 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3963 NextBB, NextBB, Name);
3964 BasicBlock *After = CL->getAfter();
3965
3966 // If location is not set, don't connect the loop.
3967 if (updateToLocation(Loc)) {
3968 // Split the loop at the insertion point: Branch to the preheader and move
3969 // every following instruction to after the loop (the After BB). Also, the
3970 // new successor is the loop's after block.
3971 spliceBB(Builder, After, /*CreateBranch=*/false);
3972 Builder.CreateBr(CL->getPreheader());
3973 }
3974
3975 // Emit the body content. We do it after connecting the loop to the CFG to
3976 // avoid the callback encountering degenerate BBs.
3977 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
3978 return Err;
3979
3980#ifndef NDEBUG
3981 CL->assertOK();
3982#endif
3983 return CL;
3984}
3985
3986 Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
3987 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3988 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3989 InsertPointTy ComputeIP, const Twine &Name) {
3990
3991 // Consider the following difficulties (assuming 8-bit signed integers):
3992 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3993 // DO I = 1, 100, 50
3994 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3995 // DO I = 100, 0, -128
3996
3997 // Start, Stop and Step must be of the same integer type.
3998 auto *IndVarTy = cast<IntegerType>(Start->getType());
3999 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4000 assert(IndVarTy == Step->getType() && "Step type mismatch");
4001
4002 LocationDescription ComputeLoc =
4003 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4004 updateToLocation(ComputeLoc);
4005
4006 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4007 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4008
4009 // Like Step, but always positive.
4010 Value *Incr = Step;
4011
4012 // Distance between Start and Stop; always positive.
4013 Value *Span;
4014
4015 // Condition checking whether no iterations are executed at all, e.g. because
4016 // UB < LB.
4017 Value *ZeroCmp;
4018
4019 if (IsSigned) {
4020 // Ensure that increment is positive. If not, negate and invert LB and UB.
4021 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4022 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4023 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4024 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4025 Span = Builder.CreateSub(UB, LB, "", false, true);
4026 ZeroCmp = Builder.CreateICmp(
4027 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4028 } else {
4029 Span = Builder.CreateSub(Stop, Start, "", true);
4030 ZeroCmp = Builder.CreateICmp(
4031 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4032 }
4033
4034 Value *CountIfLooping;
4035 if (InclusiveStop) {
4036 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4037 } else {
4038 // Avoid incrementing past stop since it could overflow.
4039 Value *CountIfTwo = Builder.CreateAdd(
4040 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4041 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4042 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4043 }
4044 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4045 "omp_" + Name + ".tripcount");
4046
4047 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4048 Builder.restoreIP(CodeGenIP);
4049 Value *Span = Builder.CreateMul(IV, Step);
4050 Value *IndVar = Builder.CreateAdd(Span, Start);
4051 return BodyGenCB(Builder.saveIP(), IndVar);
4052 };
4053 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4054 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4055}
4056
4057// Returns an LLVM function to call for initializing loop bounds using OpenMP
4058// static scheduling depending on `type`. Only i32 and i64 are supported by the
4059// runtime. Always interpret integers as unsigned similarly to
4060// CanonicalLoopInfo.
4061 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4062 OpenMPIRBuilder &OMPBuilder) {
4063 unsigned Bitwidth = Ty->getIntegerBitWidth();
4064 if (Bitwidth == 32)
4065 return OMPBuilder.getOrCreateRuntimeFunction(
4066 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4067 if (Bitwidth == 64)
4068 return OMPBuilder.getOrCreateRuntimeFunction(
4069 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4070 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4071}
4072
4073 OpenMPIRBuilder::InsertPointOrErrorTy
4074OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4075 InsertPointTy AllocaIP,
4076 bool NeedsBarrier) {
4077 assert(CLI->isValid() && "Requires a valid canonical loop");
4078 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4079 "Require dedicated allocate IP");
4080
4081 // Set up the source location value for OpenMP runtime.
4082 Builder.restoreIP(CLI->getPreheaderIP());
4083 Builder.SetCurrentDebugLocation(DL);
4084
4085 uint32_t SrcLocStrSize;
4086 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4087 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4088
4089 // Declare useful OpenMP runtime functions.
4090 Value *IV = CLI->getIndVar();
4091 Type *IVTy = IV->getType();
4092 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4093 FunctionCallee StaticFini =
4094 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4095
4096 // Allocate space for computed loop bounds as expected by the "init" function.
4097 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4098
4099 Type *I32Type = Type::getInt32Ty(M.getContext());
4100 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4101 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4102 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4103 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4104
4105 // At the end of the preheader, prepare for calling the "init" function by
4106 // storing the current loop bounds into the allocated space. A canonical loop
4107 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4108 // and produces an inclusive upper bound.
4109 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4110 Constant *Zero = ConstantInt::get(IVTy, 0);
4111 Constant *One = ConstantInt::get(IVTy, 1);
4112 Builder.CreateStore(Zero, PLowerBound);
4113 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4114 Builder.CreateStore(UpperBound, PUpperBound);
4115 Builder.CreateStore(One, PStride);
4116
4117 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4118
4119 Constant *SchedulingType = ConstantInt::get(
4120 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4121
4122 // Call the "init" function and update the trip count of the loop with the
4123 // value it produced.
4124 Builder.CreateCall(StaticInit,
4125 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4126 PUpperBound, PStride, One, Zero});
4127 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4128 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4129 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4130 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4131 CLI->setTripCount(TripCount);
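// For instance, with a trip count of 100 and 4 threads, the runtime typically
// hands each thread a contiguous quarter (thread 1 gets LB = 25, UB = 49), so
// the rewritten canonical loop below runs 25 iterations offset by LB.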
4132
4133 // Update all uses of the induction variable except the one in the condition
4134 // block that compares it with the actual upper bound, and the increment in
4135 // the latch block.
4136
4137 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4138 Builder.SetInsertPoint(CLI->getBody(),
4139 CLI->getBody()->getFirstInsertionPt());
4140 Builder.SetCurrentDebugLocation(DL);
4141 return Builder.CreateAdd(OldIV, LowerBound);
4142 });
4143
4144 // In the "exit" block, call the "fini" function.
4146 CLI->getExit()->getTerminator()->getIterator());
4147 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4148
4149 // Add the barrier if requested.
4150 if (NeedsBarrier) {
4151 InsertPointOrErrorTy BarrierIP =
4152 createBarrier(LocationDescription(Builder.saveIP(), DL),
4153 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4154 /* CheckCancelFlag */ false);
4155 if (!BarrierIP)
4156 return BarrierIP.takeError();
4157 }
4158
4159 InsertPointTy AfterIP = CLI->getAfterIP();
4160 CLI->invalidate();
4161
4162 return AfterIP;
4163}
4164
4165 OpenMPIRBuilder::InsertPointOrErrorTy
4166OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4167 CanonicalLoopInfo *CLI,
4168 InsertPointTy AllocaIP,
4169 bool NeedsBarrier,
4170 Value *ChunkSize) {
4171 assert(CLI->isValid() && "Requires a valid canonical loop");
4172 assert(ChunkSize && "Chunk size is required");
4173
4174 LLVMContext &Ctx = CLI->getFunction()->getContext();
4175 Value *IV = CLI->getIndVar();
4176 Value *OrigTripCount = CLI->getTripCount();
4177 Type *IVTy = IV->getType();
4178 assert(IVTy->getIntegerBitWidth() <= 64 &&
4179 "Max supported tripcount bitwidth is 64 bits");
4180 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4181 : Type::getInt64Ty(Ctx);
4182 Type *I32Type = Type::getInt32Ty(M.getContext());
4183 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4184 Constant *One = ConstantInt::get(InternalIVTy, 1);
4185
4186 // Declare useful OpenMP runtime functions.
4187 FunctionCallee StaticInit =
4188 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4189 FunctionCallee StaticFini =
4190 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4191
4192 // Allocate space for computed loop bounds as expected by the "init" function.
4193 Builder.restoreIP(AllocaIP);
4194 Builder.SetCurrentDebugLocation(DL);
4195 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4196 Value *PLowerBound =
4197 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4198 Value *PUpperBound =
4199 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4200 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4201
4202 // Set up the source location value for the OpenMP runtime.
4203 Builder.restoreIP(CLI->getPreheaderIP());
4204 Builder.SetCurrentDebugLocation(DL);
4205
4206 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4207 Value *CastedChunkSize =
4208 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4209 Value *CastedTripCount =
4210 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4211
4212 Constant *SchedulingType = ConstantInt::get(
4213 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4214 Builder.CreateStore(Zero, PLowerBound);
4215 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4216 Builder.CreateStore(OrigUpperBound, PUpperBound);
4217 Builder.CreateStore(One, PStride);
4218
4219 // Call the "init" function and update the trip count of the loop with the
4220 // value it produced.
4221 uint32_t SrcLocStrSize;
4222 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4223 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4224 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4225 Builder.CreateCall(StaticInit,
4226 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4227 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4228 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4229 /*pstride=*/PStride, /*incr=*/One,
4230 /*chunk=*/CastedChunkSize});
4231
4232 // Load values written by the "init" function.
4233 Value *FirstChunkStart =
4234 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4235 Value *FirstChunkStop =
4236 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4237 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4238 Value *ChunkRange =
4239 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4240 Value *NextChunkStride =
4241 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4242
4243 // Create outer "dispatch" loop for enumerating the chunks.
4244 BasicBlock *DispatchEnter = splitBB(Builder, true);
4245 Value *DispatchCounter;
4246
4247 // It is safe to assume this didn't return an error because the callback
4248 // passed into createCanonicalLoop is the only possible error source, and it
4249 // always returns success.
4250 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4251 {Builder.saveIP(), DL},
4252 [&](InsertPointTy BodyIP, Value *Counter) {
4253 DispatchCounter = Counter;
4254 return Error::success();
4255 },
4256 FirstChunkStart, CastedTripCount, NextChunkStride,
4257 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4258 "dispatch"));
4259
4260 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4261 // not have to preserve the canonical invariant.
4262 BasicBlock *DispatchBody = DispatchCLI->getBody();
4263 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4264 BasicBlock *DispatchExit = DispatchCLI->getExit();
4265 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4266 DispatchCLI->invalidate();
4267
4268 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4269 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4270 redirectTo(CLI->getExit(), DispatchLatch, DL);
4271 redirectTo(DispatchBody, DispatchEnter, DL);
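// Conceptually, the rewired nest now executes:
//   for (dispatch = FirstChunkStart; dispatch < TripCount; dispatch += Stride)
//     for (iv = 0; iv < ChunkTripCount; ++iv)
//       body(dispatch + iv);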
4272
4273 // Prepare the prolog of the chunk loop.
4274 Builder.restoreIP(CLI->getPreheaderIP());
4275 Builder.SetCurrentDebugLocation(DL);
4276
4277 // Compute the number of iterations of the chunk loop.
4278 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4279 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4280 Value *IsLastChunk =
4281 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4282 Value *CountUntilOrigTripCount =
4283 Builder.CreateSub(CastedTripCount, DispatchCounter);
4284 Value *ChunkTripCount = Builder.CreateSelect(
4285 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4286 Value *BackcastedChunkTC =
4287 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4288 CLI->setTripCount(BackcastedChunkTC);
4289
4290 // Update all uses of the induction variable except the one in the condition
4291 // block that compares it with the actual upper bound, and the increment in
4292 // the latch block.
4293 Value *BackcastedDispatchCounter =
4294 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4295 CLI->mapIndVar([&](Instruction *) -> Value * {
4296 Builder.restoreIP(CLI->getBodyIP());
4297 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4298 });
4299
4300 // In the "exit" block, call the "fini" function.
4301 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4302 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4303
4304 // Add the barrier if requested.
4305 if (NeedsBarrier) {
4306 InsertPointOrErrorTy AfterIP =
4307 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4308 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4309 if (!AfterIP)
4310 return AfterIP.takeError();
4311 }
4312
4313#ifndef NDEBUG
4314 // Even though we currently do not support applying additional methods to it,
4315 // the chunk loop should remain a canonical loop.
4316 CLI->assertOK();
4317#endif
4318
4319 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4320}
4321
4322// Returns an LLVM function to call for executing an OpenMP static worksharing
4323// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4324// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4325static FunctionCallee
4326 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4327 WorksharingLoopType LoopType) {
4328 unsigned Bitwidth = Ty->getIntegerBitWidth();
4329 Module &M = OMPBuilder->M;
4330 switch (LoopType) {
4331 case WorksharingLoopType::ForStaticLoop:
4332 if (Bitwidth == 32)
4333 return OMPBuilder->getOrCreateRuntimeFunction(
4334 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4335 if (Bitwidth == 64)
4336 return OMPBuilder->getOrCreateRuntimeFunction(
4337 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4338 break;
4339 case WorksharingLoopType::DistributeStaticLoop:
4340 if (Bitwidth == 32)
4341 return OMPBuilder->getOrCreateRuntimeFunction(
4342 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4343 if (Bitwidth == 64)
4344 return OMPBuilder->getOrCreateRuntimeFunction(
4345 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4346 break;
4347 case WorksharingLoopType::DistributeForStaticLoop:
4348 if (Bitwidth == 32)
4349 return OMPBuilder->getOrCreateRuntimeFunction(
4350 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4351 if (Bitwidth == 64)
4352 return OMPBuilder->getOrCreateRuntimeFunction(
4353 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4354 break;
4355 }
4356 if (Bitwidth != 32 && Bitwidth != 64) {
4357 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4358 }
4359 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4360}
4361
4362// Inserts a call to proper OpenMP Device RTL function which handles
4363// loop worksharing.
4364 static void createTargetLoopWorkshareCall(
4365 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4366 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4367 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4368 Type *TripCountTy = TripCount->getType();
4369 Module &M = OMPBuilder->M;
4370 IRBuilder<> &Builder = OMPBuilder->Builder;
4371 FunctionCallee RTLFn =
4372 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4373 SmallVector<Value *, 8> RealArgs;
4374 RealArgs.push_back(Ident);
4375 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4376 RealArgs.push_back(LoopBodyArg);
4377 RealArgs.push_back(TripCount);
4378 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4379 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4380 Builder.CreateCall(RTLFn, RealArgs);
4381 return;
4382 }
4383 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4384 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4385 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4386 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4387
4388 RealArgs.push_back(
4389 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4390 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4391 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4392 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4393 }
4394
4395 Builder.CreateCall(RTLFn, RealArgs);
4396}
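// The emitted runtime call has roughly this shape (illustrative, for the
// 32-bit worksharing variant; the distribute-for form appends one more zero):
//   __kmpc_for_static_loop_4u(ident, (void *)body_fn, body_args, tripcount,
//                             num_threads, /*chunk=*/0);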
4397
4398static void
4399 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4400 CanonicalLoopInfo *CLI, Value *Ident,
4401 Function &OutlinedFn, Type *ParallelTaskPtr,
4402 const SmallVector<Instruction *, 4> &ToBeDeleted,
4403 WorksharingLoopType LoopType) {
4404 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4405 BasicBlock *Preheader = CLI->getPreheader();
4406 Value *TripCount = CLI->getTripCount();
4407
4408 // After loop body outlining, the loop body contains only the setup of the
4409 // loop body argument structure and the call to the outlined loop body
4410 // function. First, we need to move the setup of the loop body args into the
4411 // loop preheader.
4412 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4413 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4414
4415 // The next step is to remove the whole loop. We do not need it anymore.
4416 // That's why we make an unconditional branch from the loop preheader to the
4417 // loop exit block.
4418 Builder.restoreIP({Preheader, Preheader->end()});
4419 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4420 Preheader->getTerminator()->eraseFromParent();
4421 Builder.CreateBr(CLI->getExit());
4422
4423 // Delete dead loop blocks
4424 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4425 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4426 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4427 CleanUpInfo.EntryBB = CLI->getHeader();
4428 CleanUpInfo.ExitBB = CLI->getExit();
4429 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4430 DeleteDeadBlocks(BlocksToBeRemoved);
4431
4432 // Find the instruction which corresponds to loop body argument structure
4433 // and remove the call to loop body function instruction.
4434 Value *LoopBodyArg;
4435 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4436 assert(OutlinedFnUser &&
4437 "Expected unique undroppable user of outlined function");
4438 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4439 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4440 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4441 "Expected outlined function call to be located in loop preheader");
4442 // Check in case no argument structure has been passed.
4443 if (OutlinedFnCallInstruction->arg_size() > 1)
4444 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4445 else
4446 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4447 OutlinedFnCallInstruction->eraseFromParent();
4448
4449 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4450 LoopBodyArg, ParallelTaskPtr, TripCount,
4451 OutlinedFn);
4452
4453 for (auto &ToBeDeletedItem : ToBeDeleted)
4454 ToBeDeletedItem->eraseFromParent();
4455 CLI->invalidate();
4456}
4457
4458 OpenMPIRBuilder::InsertPointTy
4459OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4460 InsertPointTy AllocaIP,
4461 WorksharingLoopType LoopType) {
4462 uint32_t SrcLocStrSize;
4463 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4464 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4465
4466 OutlineInfo OI;
4467 OI.OuterAllocaBB = CLI->getPreheader();
4468 Function *OuterFn = CLI->getPreheader()->getParent();
4469
4470 // Instructions which need to be deleted at the end of code generation
4471 SmallVector<Instruction *, 4> ToBeDeleted;
4472
4473 OI.OuterAllocaBB = AllocaIP.getBlock();
4474
4475 // Mark the loop body as the region which needs to be extracted.
4476 OI.EntryBB = CLI->getBody();
4477 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4478 "omp.prelatch", true);
4479
4480 // Prepare loop body for extraction
4481 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4482
4483 // Insert new loop counter variable which will be used only in loop
4484 // body.
4485 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4486 Instruction *NewLoopCntLoad =
4487 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4488 // New loop counter instructions are redundant in the loop preheader when
4489 // code generation for the workshare loop is finished. That's why we mark
4490 // them as ready for deletion.
4491 ToBeDeleted.push_back(NewLoopCntLoad);
4492 ToBeDeleted.push_back(NewLoopCnt);
4493
4494 // Analyse loop body region. Find all input variables which are used inside
4495 // loop body region.
4496 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4497 SmallVector<BasicBlock *, 32> Blocks;
4498 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4499 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4500 ParallelRegionBlockSet.end());
4501
4502 CodeExtractorAnalysisCache CEAC(*OuterFn);
4503 CodeExtractor Extractor(Blocks,
4504 /* DominatorTree */ nullptr,
4505 /* AggregateArgs */ true,
4506 /* BlockFrequencyInfo */ nullptr,
4507 /* BranchProbabilityInfo */ nullptr,
4508 /* AssumptionCache */ nullptr,
4509 /* AllowVarArgs */ true,
4510 /* AllowAlloca */ true,
4511 /* AllocationBlock */ CLI->getPreheader(),
4512 /* Suffix */ ".omp_wsloop",
4513 /* AggrArgsIn0AddrSpace */ true);
4514
4515 BasicBlock *CommonExit = nullptr;
4516 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4517
4518 // Find allocas outside the loop body region which are used inside loop
4519 // body
4520 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4521
4522 // We need to model the loop body region as the function f(cnt, loop_arg).
4523 // That's why we replace the loop induction variable with the new counter,
4524 // which will be one of the loop body function's arguments.
4525 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4526 CLI->getIndVar()->user_end());
4527 for (auto Use : Users) {
4528 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4529 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4530 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4531 }
4532 }
4533 }
4534 // Make sure that the loop counter variable is not merged into the loop body
4535 // function argument structure and that it is passed as a separate variable.
4536 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4537
4538 // The PostOutline CB is invoked when the loop body function is outlined and
4539 // the loop body is replaced by a call to the outlined function. We need to
4540 // add a call to the OpenMP device RTL inside the loop preheader. The OpenMP
4541 // device RTL function will handle the loop control logic.
4542 //
4543 OI.PostOutlineCB = [=, ToBeDeletedVec =
4544 std::move(ToBeDeleted)](Function &OutlinedFn) {
4545 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4546 ToBeDeletedVec, LoopType);
4547 };
4548 addOutlineInfo(std::move(OI));
4549 return CLI->getAfterIP();
4550}
4551
4552 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4553 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4554 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4555 bool HasSimdModifier, bool HasMonotonicModifier,
4556 bool HasNonmonotonicModifier, bool HasOrderedClause,
4557 WorksharingLoopType LoopType) {
4558 if (Config.isTargetDevice())
4559 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4560 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4561 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4562 HasNonmonotonicModifier, HasOrderedClause);
4563
4564 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4565 OMPScheduleType::ModifierOrdered;
4566 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4567 case OMPScheduleType::BaseStatic:
4568 assert(!ChunkSize && "No chunk size with non-chunked static schedule");
4569 if (IsOrdered)
4570 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4571 NeedsBarrier, ChunkSize);
4572 // FIXME: Monotonicity ignored?
4573 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4574
4575 case OMPScheduleType::BaseStaticChunked:
4576 if (IsOrdered)
4577 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4578 NeedsBarrier, ChunkSize);
4579 // FIXME: Monotonicity ignored?
4580 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4581 ChunkSize);
4582
4583 case OMPScheduleType::BaseRuntime:
4584 case OMPScheduleType::BaseAuto:
4585 case OMPScheduleType::BaseGreedy:
4586 case OMPScheduleType::BaseBalanced:
4587 case OMPScheduleType::BaseSteal:
4588 case OMPScheduleType::BaseGuidedSimd:
4589 case OMPScheduleType::BaseRuntimeSimd:
4590 assert(!ChunkSize &&
4591 "schedule type does not support user-defined chunk sizes");
4592 [[fallthrough]];
4593 case OMPScheduleType::BaseDynamicChunked:
4594 case OMPScheduleType::BaseGuidedChunked:
4595 case OMPScheduleType::BaseGuidedIterativeChunked:
4596 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4597 case OMPScheduleType::BaseStaticBalancedChunked:
4598 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4599 NeedsBarrier, ChunkSize);
4600
4601 default:
4602 llvm_unreachable("Unknown/unimplemented schedule kind");
4603 }
4604}
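// Example (hypothetical caller; variable names and defaulted arguments are
// assumed, not taken from this file): a frontend lowering
// `#pragma omp for schedule(dynamic, 4)` over an existing CanonicalLoopInfo
// would reach this dispatcher roughly as:
//
//   InsertPointOrErrorTy AfterIP = OMPBuilder.applyWorkshareLoop(
//       DL, CLI, AllocaIP, /*NeedsBarrier=*/true,
//       omp::OMP_SCHEDULE_Dynamic, /*ChunkSize=*/Builder.getInt32(4),
//       /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
//       /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/false);
//
// computeOpenMPScheduleType folds the clause and its modifiers into a single
// OMPScheduleType value, which the switch above maps to the static, static-
// chunked, or dynamic lowering.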
4605
4606/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4607 /// dynamic scheduling depending on \p Ty. Only i32 and i64 are supported by
4608/// the runtime. Always interpret integers as unsigned similarly to
4609/// CanonicalLoopInfo.
4610static FunctionCallee
4611 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4612 unsigned Bitwidth = Ty->getIntegerBitWidth();
4613 if (Bitwidth == 32)
4614 return OMPBuilder.getOrCreateRuntimeFunction(
4615 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4616 if (Bitwidth == 64)
4617 return OMPBuilder.getOrCreateRuntimeFunction(
4618 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4619 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4620}
4621
4622/// Returns an LLVM function to call for updating the next loop using OpenMP
4623 /// dynamic scheduling depending on \p Ty. Only i32 and i64 are supported by
4624/// the runtime. Always interpret integers as unsigned similarly to
4625/// CanonicalLoopInfo.
4626static FunctionCallee
4627 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4628 unsigned Bitwidth = Ty->getIntegerBitWidth();
4629 if (Bitwidth == 32)
4630 return OMPBuilder.getOrCreateRuntimeFunction(
4631 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4632 if (Bitwidth == 64)
4633 return OMPBuilder.getOrCreateRuntimeFunction(
4634 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4635 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4636}
4637
4638 /// Returns an LLVM function to call for finalizing the dynamic loop,
4639 /// depending on \p Ty. Only i32 and i64 are supported by the runtime. Always
4640 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
4641static FunctionCallee
4642 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4643 unsigned Bitwidth = Ty->getIntegerBitWidth();
4644 if (Bitwidth == 32)
4645 return OMPBuilder.getOrCreateRuntimeFunction(
4646 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4647 if (Bitwidth == 64)
4648 return OMPBuilder.getOrCreateRuntimeFunction(
4649 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4650 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4651}
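// For reference, the dispatch protocol these helpers bind to looks like this
// for an i32 induction variable (sketch of the runtime calls; bounds are
// inclusive, matching how applyDynamicWorkshareLoop below initializes them):
//
//   call void @__kmpc_dispatch_init_4u(ptr %loc, i32 %tid, i32 %sched,
//                                      i32 1, i32 %tripcount, i32 1, i32 %chunk)
//   ; repeatedly:
//   %more = call i32 @__kmpc_dispatch_next_4u(ptr %loc, i32 %tid, ptr %plast,
//                                             ptr %plb, ptr %pub, ptr %pstride)
//   ; run iterations [*%plb, *%pub] while %more != 0
//
// __kmpc_dispatch_fini_* is only required for ordered schedules.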
4652
4653 OpenMPIRBuilder::InsertPointOrErrorTy
4654 OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4655 InsertPointTy AllocaIP,
4656 OMPScheduleType SchedType,
4657 bool NeedsBarrier, Value *Chunk) {
4658 assert(CLI->isValid() && "Requires a valid canonical loop");
4659 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4660 "Require dedicated allocate IP");
4662 "Require valid schedule type");
4663
4664 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4665 OMPScheduleType::ModifierOrdered;
4666
4667 // Set up the source location value for OpenMP runtime.
4668 Builder.SetCurrentDebugLocation(DL);
4669
4670 uint32_t SrcLocStrSize;
4671 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4672 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4673
4674 // Declare useful OpenMP runtime functions.
4675 Value *IV = CLI->getIndVar();
4676 Type *IVTy = IV->getType();
4677 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4678 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4679
4680 // Allocate space for computed loop bounds as expected by the "init" function.
4681 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4682 Type *I32Type = Type::getInt32Ty(M.getContext());
4683 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4684 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4685 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4686 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4687
4688 // At the end of the preheader, prepare for calling the "init" function by
4689 // storing the current loop bounds into the allocated space. A canonical loop
4690 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4691 // and produces an inclusive upper bound.
4692 BasicBlock *PreHeader = CLI->getPreheader();
4693 Builder.SetInsertPoint(PreHeader->getTerminator());
4694 Constant *One = ConstantInt::get(IVTy, 1);
4695 Builder.CreateStore(One, PLowerBound);
4696 Value *UpperBound = CLI->getTripCount();
4697 Builder.CreateStore(UpperBound, PUpperBound);
4698 Builder.CreateStore(One, PStride);
4699
4700 BasicBlock *Header = CLI->getHeader();
4701 BasicBlock *Exit = CLI->getExit();
4702 BasicBlock *Cond = CLI->getCond();
4703 BasicBlock *Latch = CLI->getLatch();
4704 InsertPointTy AfterIP = CLI->getAfterIP();
4705
4706 // The CLI will be "broken" in the code below, as the loop is no longer
4707 // a valid canonical loop.
4708
4709 if (!Chunk)
4710 Chunk = One;
4711
4712 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4713
4714 Constant *SchedulingType =
4715 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4716
4717 // Call the "init" function.
4718 Builder.CreateCall(DynamicInit,
4719 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4720 UpperBound, /* step */ One, Chunk});
4721
4722 // An outer loop around the existing one.
4723 BasicBlock *OuterCond = BasicBlock::Create(
4724 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4725 PreHeader->getParent());
4726 // The "next" call returns an i32, so the zero compared below must always be 32-bit; we cannot reuse the IVTy constants above.
4727 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4728 Value *Res =
4729 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4730 PLowerBound, PUpperBound, PStride});
4731 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4732 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4733 Value *LowerBound =
4734 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4735 Builder.CreateCondBr(MoreWork, Header, Exit);
4736
4737 // Change the PHI node in the loop header to use OuterCond rather than the
4738 // preheader as the incoming block, and set the IV to LowerBound.
4739 Instruction *Phi = &Header->front();
4740 auto *PI = cast<PHINode>(Phi);
4741 PI->setIncomingBlock(0, OuterCond);
4742 PI->setIncomingValue(0, LowerBound);
4743
4744 // Then set the pre-header to jump to the OuterCond
4745 Instruction *Term = PreHeader->getTerminator();
4746 auto *Br = cast<BranchInst>(Term);
4747 Br->setSuccessor(0, OuterCond);
4748
4749 // Modify the inner condition:
4750 // * Use the UpperBound returned from the DynamicNext call.
4751 // * Jump to the outer loop's condition when done with one of the inner loops.
4752 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4753 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4754 Instruction *Comp = &*Builder.GetInsertPoint();
4755 auto *CI = cast<CmpInst>(Comp);
4756 CI->setOperand(1, UpperBound);
4757 // Redirect the inner exit to branch to outer condition.
4758 Instruction *Branch = &Cond->back();
4759 auto *BI = cast<BranchInst>(Branch);
4760 assert(BI->getSuccessor(1) == Exit);
4761 BI->setSuccessor(1, OuterCond);
4762
4763 // Call the "fini" function if "ordered" is present in wsloop directive.
4764 if (Ordered) {
4765 Builder.SetInsertPoint(&Latch->back());
4766 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4767 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4768 }
4769
4770 // Add the barrier if requested.
4771 if (NeedsBarrier) {
4772 Builder.SetInsertPoint(&Exit->back());
4773 InsertPointOrErrorTy BarrierIP =
4774 createBarrier(LocationDescription(Builder.saveIP(), DL),
4775 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4776 /* CheckCancelFlag */ false);
4777 if (!BarrierIP)
4778 return BarrierIP.takeError();
4779 }
4780
4781 CLI->invalidate();
4782 return AfterIP;
4783}
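// Resulting control flow, in sketch form (the canonical loop is reused as the
// inner loop over each chunk handed out by the runtime):
//
//   preheader:   __kmpc_dispatch_init_*(...); br outer.cond
//   outer.cond:  %more = __kmpc_dispatch_next_*(...)
//                %lb   = (load p.lowerbound) - 1
//                br %more ? header : exit
//   header:      the PHI now takes %lb from outer.cond instead of 0 from the
//                preheader
//   cond:        compares against the upper bound loaded from p.upperbound and
//                branches back to outer.cond (not exit) when the chunk is done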
4784
4785/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4786/// after this \p OldTarget will be orphaned.
4787 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4788 BasicBlock *NewTarget, DebugLoc DL) {
4789 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4790 redirectTo(Pred, NewTarget, DL);
4791}
4792
4793 /// Determine which blocks in \p BBs are reachable from outside and remove
4794 /// from the function the ones that are not.
4795 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4796 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4797 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4798 for (Use &U : BB->uses()) {
4799 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4800 if (!UseInst)
4801 continue;
4802 if (BBsToErase.count(UseInst->getParent()))
4803 continue;
4804 return true;
4805 }
4806 return false;
4807 };
4808
4809 while (BBsToErase.remove_if(HasRemainingUses)) {
4810 // Try again if anything was removed.
4811 }
4812
4813 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4814 DeleteDeadBlocks(BBVec);
4815}
4816
4817 CanonicalLoopInfo *
4818 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4819 InsertPointTy ComputeIP) {
4820 assert(Loops.size() >= 1 && "At least one loop required");
4821 size_t NumLoops = Loops.size();
4822
4823 // Nothing to do if there is already just one loop.
4824 if (NumLoops == 1)
4825 return Loops.front();
4826
4827 CanonicalLoopInfo *Outermost = Loops.front();
4828 CanonicalLoopInfo *Innermost = Loops.back();
4829 BasicBlock *OrigPreheader = Outermost->getPreheader();
4830 BasicBlock *OrigAfter = Outermost->getAfter();
4831 Function *F = OrigPreheader->getParent();
4832
4833 // Loop control blocks that may become orphaned later.
4834 SmallVector<BasicBlock *, 12> OldControlBBs;
4835 OldControlBBs.reserve(6 * Loops.size());
4836 for (CanonicalLoopInfo *Loop : Loops)
4837 Loop->collectControlBlocks(OldControlBBs);
4838
4839 // Setup the IRBuilder for inserting the trip count computation.
4840 Builder.SetCurrentDebugLocation(DL);
4841 if (ComputeIP.isSet())
4842 Builder.restoreIP(ComputeIP);
4843 else
4844 Builder.restoreIP(Outermost->getPreheaderIP());
4845
4846 // Derive the collapsed loop's trip count.
4847 // TODO: Find common/largest indvar type.
4848 Value *CollapsedTripCount = nullptr;
4849 for (CanonicalLoopInfo *L : Loops) {
4850 assert(L->isValid() &&
4851 "All loops to collapse must be valid canonical loops");
4852 Value *OrigTripCount = L->getTripCount();
4853 if (!CollapsedTripCount) {
4854 CollapsedTripCount = OrigTripCount;
4855 continue;
4856 }
4857
4858 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4859 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4860 {}, /*HasNUW=*/true);
4861 }
4862
4863 // Create the collapsed loop control flow.
4864 CanonicalLoopInfo *Result =
4865 createLoopSkeleton(DL, CollapsedTripCount, F,
4866 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4867
4868 // Build the collapsed loop body code.
4869 // Start with deriving the input loops' induction variables from the
4870 // collapsed one, using a divmod scheme. To preserve the original loops'
4871 // order, the innermost loop uses the least significant bits.
4872 Builder.restoreIP(Result->getBodyIP());
4873
4874 Value *Leftover = Result->getIndVar();
4875 SmallVector<Value *> NewIndVars;
4876 NewIndVars.resize(NumLoops);
4877 for (int i = NumLoops - 1; i >= 1; --i) {
4878 Value *OrigTripCount = Loops[i]->getTripCount();
4879
4880 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4881 NewIndVars[i] = NewIndVar;
4882
4883 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4884 }
4885 // Outermost loop gets all the remaining bits.
4886 NewIndVars[0] = Leftover;
4887
4888 // Construct the loop body control flow.
4889 // We progressively construct the branch structure following the direction
4890 // of control flow: the leading in-between code, then the loop nest body,
4891 // then the trailing in-between code, finally rejoining the collapsed loop's
4892 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
4893 // next edge: if ContinueBlock is set, continue with that block; if
4894 // ContinuePred is set, use its predecessors as sources.
4895 BasicBlock *ContinueBlock = Result->getBody();
4896 BasicBlock *ContinuePred = nullptr;
4897 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4898 BasicBlock *NextSrc) {
4899 if (ContinueBlock)
4900 redirectTo(ContinueBlock, Dest, DL);
4901 else
4902 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4903
4904 ContinueBlock = nullptr;
4905 ContinuePred = NextSrc;
4906 };
4907
4908 // The code before the nested loop of each level.
4909 // Because we are sinking it into the nest, it will be executed more often
4910 // than in the original loop. More sophisticated schemes could keep track
4911 // of what the in-between code is and instantiate it only once per thread.
4912 for (size_t i = 0; i < NumLoops - 1; ++i)
4913 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4914
4915 // Connect the loop nest body.
4916 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4917
4918 // The code after the nested loop at each level.
4919 for (size_t i = NumLoops - 1; i > 0; --i)
4920 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4921
4922 // Connect the finished loop to the collapsed loop latch.
4923 ContinueWith(Result->getLatch(), nullptr);
4924
4925 // Replace the input loops with the new collapsed loop.
4926 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4927 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4928
4929 // Replace the input loop indvars with the derived ones.
4930 for (size_t i = 0; i < NumLoops; ++i)
4931 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4932
4933 // Remove unused parts of the input loops.
4934 removeUnusedBlocksFromParent(OldControlBBs);
4935
4936 for (CanonicalLoopInfo *L : Loops)
4937 L->invalidate();
4938
4939#ifndef NDEBUG
4940 Result->assertOK();
4941#endif
4942 return Result;
4943}
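// Worked example: collapsing two canonical loops with trip counts %tc0 and
// %tc1 produces one loop of %tc0 * %tc1 iterations whose IV %iv is split back
// into the original IVs with the divmod scheme above:
//
//   %iv1 = urem %iv, %tc1   ; innermost gets the least significant part
//   %iv0 = udiv %iv, %tc1   ; outermost gets the remaining bits
//
// For deeper nests the urem/udiv pair is applied repeatedly from the
// innermost loop outwards.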
4944
4945std::vector<CanonicalLoopInfo *>
4946 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4947 ArrayRef<Value *> TileSizes) {
4948 assert(TileSizes.size() == Loops.size() &&
4949 "Must pass as many tile sizes as there are loops");
4950 int NumLoops = Loops.size();
4951 assert(NumLoops >= 1 && "At least one loop to tile required");
4952
4953 CanonicalLoopInfo *OutermostLoop = Loops.front();
4954 CanonicalLoopInfo *InnermostLoop = Loops.back();
4955 Function *F = OutermostLoop->getBody()->getParent();
4956 BasicBlock *InnerEnter = InnermostLoop->getBody();
4957 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4958
4959 // Loop control blocks that may become orphaned later.
4960 SmallVector<BasicBlock *, 12> OldControlBBs;
4961 OldControlBBs.reserve(6 * Loops.size());
4962 for (CanonicalLoopInfo *Loop : Loops)
4963 Loop->collectControlBlocks(OldControlBBs);
4964
4965 // Collect the original trip counts and induction variables so that they
4966 // are accessible by index. Also, the structure of the original loops is
4967 // not preserved during the construction of the tiled loops, so do this
4968 // before we scavenge the BBs of any original CanonicalLoopInfo.
4969 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4970 for (CanonicalLoopInfo *L : Loops) {
4971 assert(L->isValid() && "All input loops must be valid canonical loops");
4972 OrigTripCounts.push_back(L->getTripCount());
4973 OrigIndVars.push_back(L->getIndVar());
4974 }
4975
4976 // Collect the code between loop headers. These may contain SSA definitions
4977 // that are used in the loop nest body. To be usable within the innermost
4978 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4979 // these instructions may be executed more often than before the tiling.
4980 // TODO: It would be sufficient to only sink them into the body of the
4981 // corresponding tile loop.
4982 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
4983 for (int i = 0; i < NumLoops - 1; ++i) {
4984 CanonicalLoopInfo *Surrounding = Loops[i];
4985 CanonicalLoopInfo *Nested = Loops[i + 1];
4986
4987 BasicBlock *EnterBB = Surrounding->getBody();
4988 BasicBlock *ExitBB = Nested->getHeader();
4989 InbetweenCode.emplace_back(EnterBB, ExitBB);
4990 }
4991
4992 // Compute the trip counts of the floor loops.
4993 Builder.SetCurrentDebugLocation(DL);
4994 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4995 SmallVector<Value *, 4> FloorCount, FloorRems;
4996 for (int i = 0; i < NumLoops; ++i) {
4997 Value *TileSize = TileSizes[i];
4998 Value *OrigTripCount = OrigTripCounts[i];
4999 Type *IVType = OrigTripCount->getType();
5000
5001 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5002 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5003
5004 // 0 if the tilesize divides the tripcount, 1 otherwise.
5005 // 1 means we need an additional iteration for a partial tile.
5006 //
5007 // Unfortunately we cannot just use the roundup formula
5008 //   (tripcount + tilesize - 1) / tilesize
5009 // because the summation might overflow. We do not want to introduce
5010 // undefined behavior where the untiled loop nest did not have any.
5011 Value *FloorTripOverflow =
5012 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5013
5014 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5015 FloorTripCount =
5016 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5017 "omp_floor" + Twine(i) + ".tripcount", true);
5018
5019 // Remember some values for later use.
5020 FloorCount.push_back(FloorTripCount);
5021 FloorRems.push_back(FloorTripRem);
5022 }
5023
5024 // Generate the new loop nest, from the outermost to the innermost.
5025 std::vector<CanonicalLoopInfo *> Result;
5026 Result.reserve(NumLoops * 2);
5027
5028 // The basic block of the surrounding loop that enters the generated loop
5029 // nest.
5030 BasicBlock *Enter = OutermostLoop->getPreheader();
5031
5032 // The basic block of the surrounding loop where the inner code should
5033 // continue.
5034 BasicBlock *Continue = OutermostLoop->getAfter();
5035
5036 // Where the next loop basic block should be inserted.
5037 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5038
5039 auto EmbeddNewLoop =
5040 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5041 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5042 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5043 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5044 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5045 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5046
5047 // Setup the position where the next embedded loop connects to this loop.
5048 Enter = EmbeddedLoop->getBody();
5049 Continue = EmbeddedLoop->getLatch();
5050 OutroInsertBefore = EmbeddedLoop->getLatch();
5051 return EmbeddedLoop;
5052 };
5053
5054 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5055 const Twine &NameBase) {
5056 for (auto P : enumerate(TripCounts)) {
5057 CanonicalLoopInfo *EmbeddedLoop =
5058 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5059 Result.push_back(EmbeddedLoop);
5060 }
5061 };
5062
5063 EmbeddNewLoops(FloorCount, "floor");
5064
5065 // Within the innermost floor loop, emit the code that computes the tile
5066 // sizes.
5067 Builder.restoreIP(Result.back()->getBodyIP());
5068 SmallVector<Value *, 4> TileCounts;
5069 for (int i = 0; i < NumLoops; ++i) {
5070 CanonicalLoopInfo *FloorLoop = Result[i];
5071 Value *TileSize = TileSizes[i];
5072
5073 Value *FloorIsEpilogue =
5074 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5075 Value *TileTripCount =
5076 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5077
5078 TileCounts.push_back(TileTripCount);
5079 }
5080
5081 // Create the tile loops.
5082 EmbeddNewLoops(TileCounts, "tile");
5083
5084 // Insert the inbetween code into the body.
5085 BasicBlock *BodyEnter = Enter;
5086 BasicBlock *BodyEntered = nullptr;
5087 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5088 BasicBlock *EnterBB = P.first;
5089 BasicBlock *ExitBB = P.second;
5090
5091 if (BodyEnter)
5092 redirectTo(BodyEnter, EnterBB, DL);
5093 else
5094 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5095
5096 BodyEnter = nullptr;
5097 BodyEntered = ExitBB;
5098 }
5099
5100 // Append the original loop nest body into the generated loop nest body.
5101 if (BodyEnter)
5102 redirectTo(BodyEnter, InnerEnter, DL);
5103 else
5104 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5105 redirectAllPredecessorsTo(InnerLatch, Continue, DL);
5106
5107 // Replace the original induction variable with an induction variable computed
5108 // from the tile and floor induction variables.
5109 Builder.restoreIP(Result.back()->getBodyIP());
5110 for (int i = 0; i < NumLoops; ++i) {
5111 CanonicalLoopInfo *FloorLoop = Result[i];
5112 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5113 Value *OrigIndVar = OrigIndVars[i];
5114 Value *Size = TileSizes[i];
5115
5116 Value *Scale =
5117 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5118 Value *Shift =
5119 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5120 OrigIndVar->replaceAllUsesWith(Shift);
5121 }
5122
5123 // Remove unused parts of the original loops.
5124 removeUnusedBlocksFromParent(OldControlBBs);
5125
5126 for (CanonicalLoopInfo *L : Loops)
5127 L->invalidate();
5128
5129#ifndef NDEBUG
5130 for (CanonicalLoopInfo *GenL : Result)
5131 GenL->assertOK();
5132#endif
5133 return Result;
5134}
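// Worked example: tiling `for (i = 0; i < 100; ++i)` with tile size 32 gives
// FloorTripCount = 100 / 32 = 3 and FloorTripRem = 4, so the floor loop gets
// a rounded-up trip count of 4. The first three tiles execute 32 iterations
// each, the final (epilogue) tile executes the remaining 4, and the original
// IV is reconstructed as i = 32 * %floor.iv + %tile.iv.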
5135
5136/// Attach metadata \p Properties to the basic block described by \p BB. If the
5137/// basic block already has metadata, the basic block properties are appended.
5138 static void addBasicBlockMetadata(BasicBlock *BB,
5139 ArrayRef<Metadata *> Properties) {
5140 // Nothing to do if no property to attach.
5141 if (Properties.empty())
5142 return;
5143
5144 LLVMContext &Ctx = BB->getContext();
5145 SmallVector<Metadata *> NewProperties;
5146 NewProperties.push_back(nullptr);
5147
5148 // If the basic block already has metadata, prepend it to the new metadata.
5149 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5150 if (Existing)
5151 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5152
5153 append_range(NewProperties, Properties);
5154 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5155 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5156
5157 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5158}
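// For illustration, attaching e.g. an unroll property to a latch that had no
// prior metadata yields IR of the form:
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
//
// The self-referencing first operand is what makes the loop ID distinct.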
5159
5160/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5161/// loop already has metadata, the loop properties are appended.
5162 static void addLoopMetadata(CanonicalLoopInfo *Loop,
5163 ArrayRef<Metadata *> Properties) {
5164 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5165
5166 // Attach metadata to the loop's latch
5167 BasicBlock *Latch = Loop->getLatch();
5168 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5169 addBasicBlockMetadata(Latch, Properties);
5170}
5171
5172/// Attach llvm.access.group metadata to the memref instructions of \p Block
5173static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5174 LoopInfo &LI) {
5175 for (Instruction &I : *Block) {
5176 if (I.mayReadOrWriteMemory()) {
5177 // TODO: This instruction may already have an access group from other
5178 // pragmas, e.g. #pragma clang loop vectorize. Append instead, so that
5179 // the existing metadata is not overwritten.
5180 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5181 }
5182 }
5183}
5184
5185 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5186 LLVMContext &Ctx = Builder.getContext();
5187 addLoopMetadata(
5188 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5189 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5190}
5191
5192 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5193 LLVMContext &Ctx = Builder.getContext();
5194 addLoopMetadata(
5195 Loop, {
5196 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5197 });
5198}
5199
5200void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5201 Value *IfCond, ValueToValueMapTy &VMap,
5202 const Twine &NamePrefix) {
5203 Function *F = CanonicalLoop->getFunction();
5204
5205 // Define where if branch should be inserted
5206 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5207
5208 // TODO: We should not rely on the pass manager. Currently we use it only
5209 // to get the llvm::Loop which corresponds to the given CanonicalLoopInfo
5210 // object. We should instead have a method which returns all blocks between
5211 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter().
5212 FunctionAnalysisManager FAM;
5213 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5214 FAM.registerPass([]() { return LoopAnalysis(); });
5215 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5216
5217 // Get the loop which needs to be cloned
5218 LoopAnalysis LIA;
5219 LoopInfo &&LI = LIA.run(*F, FAM);
5220 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5221
5222 // Create additional blocks for the if statement
5223 BasicBlock *Head = SplitBefore->getParent();
5224 Instruction *HeadOldTerm = Head->getTerminator();
5225 llvm::LLVMContext &C = Head->getContext();
5227 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5229 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5230
5231 // Create if condition branch.
5232 Builder.SetInsertPoint(HeadOldTerm);
5233 Instruction *BrInstr =
5234 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5235 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5236 // The then-block contains the branch to the OpenMP loop, which is to be vectorized.
5237 spliceBB(IP, ThenBlock, false);
5238 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5239
5240 Builder.SetInsertPoint(ElseBlock);
5241
5242 // Clone loop for the else branch
5243 SmallVector<BasicBlock *, 8> NewBlocks;
5244
5245 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5246 for (BasicBlock *Block : L->getBlocks()) {
5247 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5248 NewBB->moveBefore(CanonicalLoop->getExit());
5249 VMap[Block] = NewBB;
5250 NewBlocks.push_back(NewBB);
5251 }
5252 remapInstructionsInBlocks(NewBlocks, VMap);
5253 Builder.CreateBr(NewBlocks.front());
5254}
5255
5256unsigned
5257 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5258 const StringMap<bool> &Features) {
5259 if (TargetTriple.isX86()) {
5260 if (Features.lookup("avx512f"))
5261 return 512;
5262 else if (Features.lookup("avx"))
5263 return 256;
5264 return 128;
5265 }
5266 if (TargetTriple.isPPC())
5267 return 128;
5268 if (TargetTriple.isWasm())
5269 return 128;
5270 return 0;
5271}
5272
5273 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5274 MapVector<Value *, Value *> AlignedVars,
5275 Value *IfCond, OrderKind Order,
5276 ConstantInt *Simdlen, ConstantInt *Safelen) {
5277 LLVMContext &Ctx = Builder.getContext();
5278
5279 Function *F = CanonicalLoop->getFunction();
5280
5281 // TODO: We should not rely on the pass manager. Currently we use it only
5282 // to get the llvm::Loop which corresponds to the given CanonicalLoopInfo
5283 // object. We should instead have a method which returns all blocks between
5284 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter().
5285 FunctionAnalysisManager FAM;
5286 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5287 FAM.registerPass([]() { return LoopAnalysis(); });
5288 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5289
5290 LoopAnalysis LIA;
5291 LoopInfo &&LI = LIA.run(*F, FAM);
5292
5293 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5294 if (AlignedVars.size()) {
5295 InsertPointTy IP = Builder.saveIP();
5296 for (auto &AlignedItem : AlignedVars) {
5297 Value *AlignedPtr = AlignedItem.first;
5298 Value *Alignment = AlignedItem.second;
5299 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5300 Builder.SetInsertPoint(loadInst->getNextNode());
5301 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5302 AlignedPtr, Alignment);
5303 }
5304 Builder.restoreIP(IP);
5305 }
5306
5307 if (IfCond) {
5308 ValueToValueMapTy VMap;
5309 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5310 // Add metadata to the cloned loop which disables vectorization
5311 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5312 assert(MappedLatch &&
5313 "Cannot find value which corresponds to original loop latch");
5314 assert(isa<BasicBlock>(MappedLatch) &&
5315 "Cannot cast mapped latch block value to BasicBlock");
5316 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5317 ConstantAsMetadata *BoolConst =
5318 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5319 addBasicBlockMetadata(
5320 NewLatchBlock,
5321 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5322 BoolConst})});
5323 }
5324
5325 SmallSet<BasicBlock *, 8> Reachable;
5326
5327 // Get the basic blocks from the loop in which memref instructions
5328 // can be found.
5329 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5330 // preferably without running any passes.
5331 for (BasicBlock *Block : L->getBlocks()) {
5332 if (Block == CanonicalLoop->getCond() ||
5333 Block == CanonicalLoop->getHeader())
5334 continue;
5335 Reachable.insert(Block);
5336 }
5337
5338 SmallVector<Metadata *> LoopMDList;
5339
5340 // In the presence of a finite 'safelen', it may be unsafe to mark all
5341 // the memory instructions parallel, because loop-carried
5342 // dependences of 'safelen' iterations are possible.
5343 // If the order(concurrent) clause is specified, then the memory
5344 // instructions are marked parallel even if 'safelen' is finite.
5345 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5346 // Add access group metadata to memory-access instructions.
5347 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5348 for (BasicBlock *BB : Reachable)
5349 addSimdMetadata(BB, AccessGroup, LI);
5350 // TODO: If the loop has existing parallel access metadata, have
5351 // to combine two lists.
5352 LoopMDList.push_back(MDNode::get(
5353 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5354 }
5355
5356 // Use the above access group metadata to create loop level
5357 // metadata, which should be distinct for each loop.
5358 ConstantAsMetadata *BoolConst =
5359 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5360 LoopMDList.push_back(MDNode::get(
5361 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5362
5363 if (Simdlen || Safelen) {
5364 // If both simdlen and safelen clauses are specified, the value of the
5365 // simdlen parameter must be less than or equal to the value of the safelen
5366 // parameter. Therefore, use safelen only in the absence of simdlen.
5367 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5368 LoopMDList.push_back(
5369 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5370 ConstantAsMetadata::get(VectorizeWidth)}));
5371 }
5372
5373 addLoopMetadata(CanonicalLoop, LoopMDList);
5374}
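// For illustration, `#pragma omp simd simdlen(8)` (no if clause, no safelen)
// results in latch metadata along the lines of:
//
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}   ; the access group attached to the memory instructions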
5375
5376/// Create the TargetMachine object to query the backend for optimization
5377/// preferences.
5378///
5379/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5380/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5381/// needed for the LLVM pass pipline. We use some default options to avoid
5382/// having to pass too many settings from the frontend that probably do not
5383/// matter.
5384///
5385/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5386/// method. If we are going to use TargetMachine for more purposes, especially
5387/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5388/// might become be worth requiring front-ends to pass on their TargetMachine,
5389/// or at least cache it between methods. Note that while fontends such as Clang
5390/// have just a single main TargetMachine per translation unit, "target-cpu" and
5391/// "target-features" that determine the TargetMachine are per-function and can
5392/// be overrided using __attribute__((target("OPTIONS"))).
5393static std::unique_ptr<TargetMachine>
5395 Module *M = F->getParent();
5396
5397 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5398 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5399 const std::string &Triple = M->getTargetTriple();
5400
5401 std::string Error;
5402 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5403 if (!TheTarget)
5404 return {};
5405
5406 TargetOptions Options;
5407 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5408 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5409 /*CodeModel=*/std::nullopt, OptLevel));
5410}
5411
5412/// Heuristically determine the best-performant unroll factor for \p CLI. This
5413/// depends on the target processor. We are re-using the same heuristics as the
5414/// LoopUnrollPass.
5415 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5416 Function *F = CLI->getFunction();
5417
5418 // Assume the user requests the most aggressive unrolling, even if the rest of
5419 // the code is optimized using a lower setting.
5420 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5421 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5422
5423 FunctionAnalysisManager FAM;
5424 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5425 FAM.registerPass([]() { return AssumptionAnalysis(); });
5426 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5427 FAM.registerPass([]() { return LoopAnalysis(); });
5428 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5429 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5430 TargetIRAnalysis TIRA;
5431 if (TM)
5432 TIRA = TargetIRAnalysis(
5433 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5434 FAM.registerPass([&]() { return TIRA; });
5435
5436 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5437 ScalarEvolutionAnalysis SEA;
5438 ScalarEvolution &&SE = SEA.run(*F, FAM);
5439 DominatorTreeAnalysis DTA;
5440 DominatorTree &&DT = DTA.run(*F, FAM);
5441 LoopAnalysis LIA;
5442 LoopInfo &&LI = LIA.run(*F, FAM);
5443 AssumptionAnalysis ACT;
5444 AssumptionCache &&AC = ACT.run(*F, FAM);
5445 OptimizationRemarkEmitter ORE{F};
5446
5447 Loop *L = LI.getLoopFor(CLI->getHeader());
5448 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5449
5450 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5451 L, SE, TTI,
5452 /*BlockFrequencyInfo=*/nullptr,
5453 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5454 /*UserThreshold=*/std::nullopt,
5455 /*UserCount=*/std::nullopt,
5456 /*UserAllowPartial=*/true,
5457 /*UserAllowRuntime=*/true,
5458 /*UserUpperBound=*/std::nullopt,
5459 /*UserFullUnrollMaxCount=*/std::nullopt);
5460
5461 UP.Force = true;
5462
5463 // Account for additional optimizations taking place before the LoopUnrollPass
5464 // would unroll the loop.
5465 UP.Threshold *= UnrollThresholdFactor;
5466 UP.PartialThreshold *= UnrollThresholdFactor;
5467
5468 // Use normal unroll factors even if the rest of the code is optimized for
5469 // size.
5470 UP.OptSizeThreshold = UP.Threshold;
5471 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5472
5473 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5474 << " Threshold=" << UP.Threshold << "\n"
5475 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5476 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5477 << " PartialOptSizeThreshold="
5478 << UP.PartialOptSizeThreshold << "\n");
5479
5480 // Disable peeling.
5481 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5482 L, SE, TTI,
5483 /*UserAllowPeeling=*/false,
5484 /*UserAllowProfileBasedPeeling=*/false,
5485 /*UnrollingSpecficValues=*/false);
5486
5487 SmallPtrSet<const Value *, 32> EphValues;
5488 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5489
5490 // Assume that reads and writes to stack variables can be eliminated by
5491 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5492 // size.
5493 for (BasicBlock *BB : L->blocks()) {
5494 for (Instruction &I : *BB) {
5495 Value *Ptr;
5496 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5497 Ptr = Load->getPointerOperand();
5498 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5499 Ptr = Store->getPointerOperand();
5500 } else
5501 continue;
5502
5503 Ptr = Ptr->stripPointerCasts();
5504
5505 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5506 if (Alloca->getParent() == &F->getEntryBlock())
5507 EphValues.insert(&I);
5508 }
5509 }
5510 }
5511
5512 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5513
5514 // Loop is not unrollable if the loop contains certain instructions.
5515 if (!UCE.canUnroll()) {
5516 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5517 return 1;
5518 }
5519
5520 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5521 << "\n");
5522
5523 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5524 // be able to use it.
5525 int TripCount = 0;
5526 int MaxTripCount = 0;
5527 bool MaxOrZero = false;
5528 unsigned TripMultiple = 0;
5529
5530 bool UseUpperBound = false;
5531 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5532 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5533 UseUpperBound);
5534 unsigned Factor = UP.Count;
5535 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5536
5537 // A factor of 1 signals that the loop should not be unrolled.
5538 if (Factor == 0)
5539 return 1;
5540 return Factor;
5541}
5542
5543 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5544 int32_t Factor,
5545 CanonicalLoopInfo **UnrolledCLI) {
5546 assert(Factor >= 0 && "Unroll factor must not be negative");
5547
5548 Function *F = Loop->getFunction();
5549 LLVMContext &Ctx = F->getContext();
5550
5551 // If the unrolled loop is not used for another loop-associated directive, it
5552 // is sufficient to add metadata for the LoopUnrollPass.
5553 if (!UnrolledCLI) {
5554 SmallVector<Metadata *, 2> LoopMetadata;
5555 LoopMetadata.push_back(
5556 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5557
5558 if (Factor >= 1) {
5559 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5560 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5561 LoopMetadata.push_back(MDNode::get(
5562 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5563 }
5564
5565 addLoopMetadata(Loop, LoopMetadata);
5566 return;
5567 }
5568
5569 // Heuristically determine the unroll factor.
5570 if (Factor == 0)
5571 Factor = computeHeuristicUnrollFactor(Loop);
5572
5573 // No change required with unroll factor 1.
5574 if (Factor == 1) {
5575 *UnrolledCLI = Loop;
5576 return;
5577 }
5578
5579 assert(Factor >= 2 &&
5580 "unrolling only makes sense with a factor of 2 or larger");
5581
5582 Type *IndVarTy = Loop->getIndVarType();
5583
5584 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5585 // unroll the inner loop.
5586 Value *FactorVal =
5587 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5588 /*isSigned=*/false));
5589 std::vector<CanonicalLoopInfo *> LoopNest =
5590 tileLoops(DL, {Loop}, {FactorVal});
5591 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5592 *UnrolledCLI = LoopNest[0];
5593 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5594
5595 // LoopUnrollPass can only fully unroll loops with constant trip count.
5596 // Unroll by the unroll factor with a fallback epilog for the remainder
5597 // iterations if necessary.
5598 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5599 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5600 addLoopMetadata(
5601 InnerLoop,
5602 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5603 MDNode::get(
5604 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5605
5606#ifndef NDEBUG
5607 (*UnrolledCLI)->assertOK();
5608#endif
5609}
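// Worked example: unrollLoopPartial with Factor == 4 first tiles the loop by
// 4, yielding a "floor" loop (returned via *UnrolledCLI and still a valid
// canonical loop usable by other loop-associated directives) and an inner
// "tile" loop of constant trip count 4, which is annotated with
//   !{!"llvm.loop.unroll.enable"} and !{!"llvm.loop.unroll.count", i32 4}
// so that the LoopUnrollPass can later fully unroll it.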
5610
5611 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyPrivate(
5612 const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf,
5614 llvm::Value *CpyFn, llvm::Value *DidIt) {
5615 if (!updateToLocation(Loc))
5616 return Loc.IP;
5617
5618 uint32_t SrcLocStrSize;
5619 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5620 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5621 Value *ThreadId = getOrCreateThreadID(Ident);
5622
5623 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5624
5625 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5626
5627 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5628 Builder.CreateCall(Fn, Args);
5629
5630 return Builder.saveIP();
5631}
5632
5633 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
5634 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5635 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5636 ArrayRef<llvm::Value *> CPFuncs) {
5637
5638 if (!updateToLocation(Loc))
5639 return Loc.IP;
5640
5641 // If needed, allocate and initialize `DidIt` with 0.
5642 // DidIt: flag variable: 1=single thread; 0=not single thread.
5643 llvm::Value *DidIt = nullptr;
5644 if (!CPVars.empty()) {
5645 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5646 Builder.CreateStore(Builder.getInt32(0), DidIt);
5647 }
5648
5649 Directive OMPD = Directive::OMPD_single;
5650 uint32_t SrcLocStrSize;
5651 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5652 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5653 Value *ThreadId = getOrCreateThreadID(Ident);
5654 Value *Args[] = {Ident, ThreadId};
5655
5656 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5657 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5658
5659 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5660 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5661
5662 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5663 if (Error Err = FiniCB(IP))
5664 return Err;
5665
5666 // The thread that executes the single region must set `DidIt` to 1.
5667 // This is used by __kmpc_copyprivate, to know if the caller is the
5668 // single thread or not.
5669 if (DidIt)
5670 Builder.CreateStore(Builder.getInt32(1), DidIt);
5671
5672 return Error::success();
5673 };
5674
5675 // generates the following:
5676 // if (__kmpc_single()) {
5677 // .... single region ...
5678 // __kmpc_end_single
5679 // }
5680 // __kmpc_copyprivate
5681 // __kmpc_barrier
5682
5683 InsertPointOrErrorTy AfterIP =
5684 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5685 /*Conditional*/ true,
5686 /*hasFinalize*/ true);
5687 if (!AfterIP)
5688 return AfterIP.takeError();
5689
5690 if (DidIt) {
5691 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5692 // NOTE BufSize is currently unused, so just pass 0.
5693 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5694 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5695 CPFuncs[I], DidIt);
5696 // NOTE __kmpc_copyprivate already inserts a barrier
5697 } else if (!IsNowait) {
5698 InsertPointOrErrorTy AfterIP =
5699 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5700 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5701 /* CheckCancelFlag */ false);
5702 if (!AfterIP)
5703 return AfterIP.takeError();
5704 }
5705 return Builder.saveIP();
5706}
5707
5708 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
5709 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5710 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5711
5712 if (!updateToLocation(Loc))
5713 return Loc.IP;
5714
5715 Directive OMPD = Directive::OMPD_critical;
5716 uint32_t SrcLocStrSize;
5717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5719 Value *ThreadId = getOrCreateThreadID(Ident);
5720 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5721 Value *Args[] = {Ident, ThreadId, LockVar};
5722
5723 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5724 Function *RTFn = nullptr;
5725 if (HintInst) {
5726 // Add Hint to entry Args and create call
5727 EnterArgs.push_back(HintInst);
5728 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5729 } else {
5730 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5731 }
5732 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5733
5734 Function *ExitRTLFn =
5735 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5736 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5737
5738 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5739 /*Conditional*/ false, /*hasFinalize*/ true);
5740}
5741
5742 OpenMPIRBuilder::InsertPointTy
5743 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5744 InsertPointTy AllocaIP, unsigned NumLoops,
5745 ArrayRef<llvm::Value *> StoreValues,
5746 const Twine &Name, bool IsDependSource) {
5747 assert(
5748 llvm::all_of(StoreValues,
5749 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5750 "OpenMP runtime requires depend vec with i64 type");
5751
5752 if (!updateToLocation(Loc))
5753 return Loc.IP;
5754
5755 // Allocate space for vector and generate alloc instruction.
5756 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5757 Builder.restoreIP(AllocaIP);
5758 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5759 ArgsBase->setAlignment(Align(8));
5760 Builder.restoreIP(Loc.IP);
5761
5762 // Store the index value with offset in depend vector.
5763 for (unsigned I = 0; I < NumLoops; ++I) {
5764 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5765 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5766 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5767 STInst->setAlignment(Align(8));
5768 }
5769
5770 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5771 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5772
5773 uint32_t SrcLocStrSize;
5774 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5775 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5776 Value *ThreadId = getOrCreateThreadID(Ident);
5777 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5778
5779 Function *RTLFn = nullptr;
5780 if (IsDependSource)
5781 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5782 else
5783 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5784 Builder.CreateCall(RTLFn, Args);
5785
5786 return Builder.saveIP();
5787}
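// Example (sketch): for `#pragma omp ordered depend(source)` in a doacross
// loop nest with NumLoops == 2, StoreValues holds the two loop indices as
// i64 values and the code above produces
//
//   %vec = alloca [2 x i64], align 8
//   ; store both indices into %vec
//   call void @__kmpc_doacross_post(ptr %loc, i32 %tid, ptr %vec)
//
// with depend(sink: ...) emitting __kmpc_doacross_wait instead.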
5788
5789 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
5790 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5791 FinalizeCallbackTy FiniCB, bool IsThreads) {
5792 if (!updateToLocation(Loc))
5793 return Loc.IP;
5794
5795 Directive OMPD = Directive::OMPD_ordered;
5796 Instruction *EntryCall = nullptr;
5797 Instruction *ExitCall = nullptr;
5798
5799 if (IsThreads) {
5800 uint32_t SrcLocStrSize;
5801 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5802 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5803 Value *ThreadId = getOrCreateThreadID(Ident);
5804 Value *Args[] = {Ident, ThreadId};
5805
5806 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5807 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5808
5809 Function *ExitRTLFn =
5810 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5811 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5812 }
5813
5814 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5815 /*Conditional*/ false, /*hasFinalize*/ true);
5816}
5817
5818OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5819 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5820 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5821 bool HasFinalize, bool IsCancellable) {
5822
5823 if (HasFinalize)
5824 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5825
5826 // Create inlined region's entry and body blocks, in preparation
5827 // for conditional creation
5828 BasicBlock *EntryBB = Builder.GetInsertBlock();
5829 Instruction *SplitPos = EntryBB->getTerminator();
5830 if (!isa_and_nonnull<BranchInst>(SplitPos))
5831 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5832 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5833 BasicBlock *FiniBB =
5834 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5835
5836 Builder.SetInsertPoint(EntryBB->getTerminator());
5837 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5838
5839 // generate body
5840 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5841 /* CodeGenIP */ Builder.saveIP()))
5842 return Err;
5843
5844 // emit exit call and do any needed finalization.
5845 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5846 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5847 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5848 "Unexpected control flow graph state!!");
5849 InsertPointOrErrorTy AfterIP =
5850 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5851 if (!AfterIP)
5852 return AfterIP.takeError();
5853 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5854 "Unexpected Control Flow State!");
5855 MergeBlockIntoPredecessor(FiniBB);
5856
5857 // If we are skipping the region of a non-conditional, remove the exit
5858 // block, and clear the builder's insertion point.
5859 assert(SplitPos->getParent() == ExitBB &&
5860 "Unexpected Insertion point location!");
5861 auto merged = MergeBlockIntoPredecessor(ExitBB);
5862 BasicBlock *ExitPredBB = SplitPos->getParent();
5863 auto InsertBB = merged ? ExitPredBB : ExitBB;
5864 if (!isa_and_nonnull<BranchInst>(SplitPos))
5865 SplitPos->eraseFromParent();
5866 Builder.SetInsertPoint(InsertBB);
5867
5868 return Builder.saveIP();
5869}
5870
5871OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5872 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5873 // If there is nothing to do, return the current insertion point.
5874 if (!Conditional || !EntryCall)
5875 return Builder.saveIP();
5876
5877 BasicBlock *EntryBB = Builder.GetInsertBlock();
5878 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5879 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5880 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5881
5882 // Emit thenBB and set the Builder's insertion point there for
5883 // body generation next. Place the block after the current block.
5884 Function *CurFn = EntryBB->getParent();
5885 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5886
5887 // Move Entry branch to end of ThenBB, and replace with conditional
5888 // branch (If-stmt)
5889 Instruction *EntryBBTI = EntryBB->getTerminator();
5890 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5891 EntryBBTI->removeFromParent();
5892 Builder.SetInsertPoint(UI);
5893 Builder.Insert(EntryBBTI);
5894 UI->eraseFromParent();
5895 Builder.SetInsertPoint(ThenBB->getTerminator());
5896
5897 // return an insertion point to ExitBB.
5898 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5899}
5900
5901OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5902 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5903 bool HasFinalize) {
5904
5905 Builder.restoreIP(FinIP);
5906
5907 // If there is finalization to do, emit it before the exit call
5908 if (HasFinalize) {
5909 assert(!FinalizationStack.empty() &&
5910 "Unexpected finalization stack state!");
5911
5912 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5913 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5914
5915 if (Error Err = Fi.FiniCB(FinIP))
5916 return Err;
5917
5918 BasicBlock *FiniBB = FinIP.getBlock();
5919 Instruction *FiniBBTI = FiniBB->getTerminator();
5920
5921 // set Builder IP for call creation
5922 Builder.SetInsertPoint(FiniBBTI);
5923 }
5924
5925 if (!ExitCall)
5926 return Builder.saveIP();
5927
5928 // Place the exit call as the last instruction before the finalization block's terminator.
5929 ExitCall->removeFromParent();
5930 Builder.Insert(ExitCall);
5931
5932 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5933 ExitCall->getIterator());
5934}
5935
5936 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5937 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5938 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5939 if (!IP.isSet())
5940 return IP;
5941
5942 IRBuilder<>::InsertPointGuard IPG(Builder);
5943
5944 // creates the following CFG structure
5945 // OMP_Entry : (MasterAddr != PrivateAddr)?
5946 // F T
5947 // | \
5948 // | copyin.not.master
5949 // | /
5950 // v /
5951 // copyin.not.master.end
5952 // |
5953 // v
5954 // OMP.Entry.Next
5955
5956 BasicBlock *OMP_Entry = IP.getBlock();
5957 Function *CurFn = OMP_Entry->getParent();
5958 BasicBlock *CopyBegin =
5959 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5960 BasicBlock *CopyEnd = nullptr;
5961
5962 // If the entry block is terminated, split it to preserve the branch to the
5963 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
5964 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5965 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5966 "copyin.not.master.end");
5967 OMP_Entry->getTerminator()->eraseFromParent();
5968 } else {
5969 CopyEnd =
5970 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5971 }
5972
5973 Builder.SetInsertPoint(OMP_Entry);
5974 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5975 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5976 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5977 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5978
5979 Builder.SetInsertPoint(CopyBegin);
5980 if (BranchtoEnd)
5981 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
5982
5983 return Builder.saveIP();
5984}
5985
5986 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
5987 Value *Size, Value *Allocator,
5988 std::string Name) {
5989 IRBuilder<>::InsertPointGuard IPG(Builder);
5990 updateToLocation(Loc);
5991
5992 uint32_t SrcLocStrSize;
5993 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5994 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5995 Value *ThreadId = getOrCreateThreadID(Ident);
5996 Value *Args[] = {ThreadId, Size, Allocator};
5997
5998 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
5999
6000 return Builder.CreateCall(Fn, Args, Name);
6001}
6002
6003 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6004 Value *Addr, Value *Allocator,
6005 std::string Name) {
6006 IRBuilder<>::InsertPointGuard IPG(Builder);
6007 updateToLocation(Loc);
6008
6009 uint32_t SrcLocStrSize;
6010 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6011 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6012 Value *ThreadId = getOrCreateThreadID(Ident);
6013 Value *Args[] = {ThreadId, Addr, Allocator};
6014 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6015 return Builder.CreateCall(Fn, Args, Name);
6016}
6017
6018 CallInst *OpenMPIRBuilder::createOMPInteropInit(
6019 const LocationDescription &Loc, Value *InteropVar,
6020 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6021 Value *DependenceAddress, bool HaveNowaitClause) {
6022 IRBuilder<>::InsertPointGuard IPG(Builder);
6023 updateToLocation(Loc);
6024
6025 uint32_t SrcLocStrSize;
6026 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6027 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6028 Value *ThreadId = getOrCreateThreadID(Ident);
6029 if (Device == nullptr)
6030 Device = ConstantInt::get(Int32, -1);
6031 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6032 if (NumDependences == nullptr) {
6033 NumDependences = ConstantInt::get(Int32, 0);
6034 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6035 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6036 }
6037 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6038 Value *Args[] = {
6039 Ident, ThreadId, InteropVar, InteropTypeVal,
6040 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6041
6042 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6043
6044 return Builder.CreateCall(Fn, Args);
6045}
6046
6047 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6048 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6049 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6050 IRBuilder<>::InsertPointGuard IPG(Builder);
6051 updateToLocation(Loc);
6052
6053 uint32_t SrcLocStrSize;
6054 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6055 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6056 Value *ThreadId = getOrCreateThreadID(Ident);
6057 if (Device == nullptr)
6058 Device = ConstantInt::get(Int32, -1);
6059 if (NumDependences == nullptr) {
6060 NumDependences = ConstantInt::get(Int32, 0);
6061 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6062 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6063 }
6064 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6065 Value *Args[] = {
6066 Ident, ThreadId, InteropVar, Device,
6067 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6068
6069 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6070
6071 return Builder.CreateCall(Fn, Args);
6072}
6073
6074 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6075 Value *InteropVar, Value *Device,
6076 Value *NumDependences,
6077 Value *DependenceAddress,
6078 bool HaveNowaitClause) {
6079 IRBuilder<>::InsertPointGuard IPG(Builder);
6080 updateToLocation(Loc);
6081 uint32_t SrcLocStrSize;
6082 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6083 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6084 Value *ThreadId = getOrCreateThreadID(Ident);
6085 if (Device == nullptr)
6086 Device = ConstantInt::get(Int32, -1);
6087 if (NumDependences == nullptr) {
6088 NumDependences = ConstantInt::get(Int32, 0);
6089 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6090 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6091 }
6092 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6093 Value *Args[] = {
6094 Ident, ThreadId, InteropVar, Device,
6095 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6096
6097 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6098
6099 return Builder.CreateCall(Fn, Args);
6100}
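// Example (sketch): lowering `#pragma omp interop init(targetsync: obj)`
// followed by a use and a destroy produces the call sequence
//
//   call void @__tgt_interop_init(ptr %ident, i32 %tid, ptr %obj, i32 %type,
//                                 i32 -1, i32 0, ptr null, i32 0)
//   call void @__tgt_interop_use(ptr %ident, i32 %tid, ptr %obj, i32 -1,
//                                i32 0, ptr null, i32 0)
//   call void @__tgt_interop_destroy(ptr %ident, i32 %tid, ptr %obj, i32 -1,
//                                    i32 0, ptr null, i32 0)
//
// where -1 selects the default device and the 0/null pair encodes "no
// dependences", matching the defaults filled in by the three builders above.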
6101
6102 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6103 const LocationDescription &Loc, llvm::Value *Pointer,
6104 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6105 IRBuilder<>::InsertPointGuard IPG(Builder);
6106 updateToLocation(Loc);
6107
6108 uint32_t SrcLocStrSize;
6109 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6110 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6111 Value *ThreadId = getOrCreateThreadID(Ident);
6112 Constant *ThreadPrivateCache =
6113 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6114 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6115
6116 Function *Fn =
6117 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6118
6119 return Builder.CreateCall(Fn, Args);
6120}
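// Illustrative sketch of the resulting call; the cache global is named
// after the Twine argument:
//   %1 = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %tid,
//            ptr %var, i64 %size, ptr @var.cache)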
6121
6122OpenMPIRBuilder::InsertPointTy
6123OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
6124 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6125 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6126 if (!updateToLocation(Loc))
6127 return Loc.IP;
6128
6129 uint32_t SrcLocStrSize;
6130 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6131 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6132 Constant *IsSPMDVal = ConstantInt::getSigned(
6133 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
6134 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6135 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6136 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6137
6138 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6139 Function *Kernel = DebugKernelWrapper;
6140
6141 // We need to strip the debug prefix to get the correct kernel name.
6142 StringRef KernelName = Kernel->getName();
6143 const std::string DebugPrefix = "_debug__";
6144 if (KernelName.ends_with(DebugPrefix)) {
6145 KernelName = KernelName.drop_back(DebugPrefix.length());
6146 Kernel = M.getFunction(KernelName);
6147 assert(Kernel && "Expected the real kernel to exist");
6148 }
6149
6150 // Manifest the launch configuration in the metadata matching the kernel
6151 // environment.
6152 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6153 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6154
6155 // For max values, < 0 means unset, == 0 means set but unknown.
6156 if (MaxThreadsVal < 0)
6157 MaxThreadsVal = std::max(
6158 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6159
6160 if (MaxThreadsVal > 0)
6161 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6162
6163 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6164 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6165 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6166 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6167 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6168 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6169
6170 Function *Fn = getOrCreateRuntimeFunctionPtr(
6171 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6172 const DataLayout &DL = Fn->getDataLayout();
6173
6174 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6175 Constant *DynamicEnvironmentInitializer =
6176 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6177 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6178 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6179 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6180 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6181 DL.getDefaultGlobalsAddressSpace());
6182 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6183
6184 Constant *DynamicEnvironment =
6185 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6186 ? DynamicEnvironmentGV
6187 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6188 DynamicEnvironmentPtr);
6189
6190 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6191 ConfigurationEnvironment, {
6192 UseGenericStateMachineVal,
6193 MayUseNestedParallelismVal,
6194 IsSPMDVal,
6195 MinThreads,
6196 MaxThreads,
6197 MinTeams,
6198 MaxTeams,
6199 ReductionDataSize,
6200 ReductionBufferLength,
6201 });
6202 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6203 KernelEnvironment, {
6204 ConfigurationEnvironmentInitializer,
6205 Ident,
6206 DynamicEnvironment,
6207 });
6208 std::string KernelEnvironmentName =
6209 (KernelName + "_kernel_environment").str();
6210 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6211 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6212 KernelEnvironmentInitializer, KernelEnvironmentName,
6213 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6214 DL.getDefaultGlobalsAddressSpace());
6215 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6216
6217 Constant *KernelEnvironment =
6218 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6219 ? KernelEnvironmentGV
6220 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6221 KernelEnvironmentPtr);
6222 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6223 CallInst *ThreadKind =
6224 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6225
6226 Value *ExecUserCode = Builder.CreateICmpEQ(
6227 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6228 "exec_user_code");
6229
6230 // ThreadKind = __kmpc_target_init(...)
6231 // if (ThreadKind == -1)
6232 // user_code
6233 // else
6234 // return;
6235
6236 auto *UI = Builder.CreateUnreachable();
6237 BasicBlock *CheckBB = UI->getParent();
6238 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6239
6240 BasicBlock *WorkerExitBB = BasicBlock::Create(
6241 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6242 Builder.SetInsertPoint(WorkerExitBB);
6243 Builder.CreateRetVoid();
6244
6245 auto *CheckBBTI = CheckBB->getTerminator();
6246 Builder.SetInsertPoint(CheckBBTI);
6247 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6248
6249 CheckBBTI->eraseFromParent();
6250 UI->eraseFromParent();
6251
6252 // Continue in the "user_code" block, see diagram above and in
6253 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6254 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6255}
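// Taken together, for a kernel "foo" the above emits roughly (sketch, with
// the environment struct contents elided):
//   @foo_dynamic_environment = weak_odr protected global ...
//   @foo_kernel_environment = weak_odr protected constant ...
//   %tk = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit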
6256
6257void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6258 int32_t TeamsReductionDataSize,
6259 int32_t TeamsReductionBufferLength) {
6260 if (!updateToLocation(Loc))
6261 return;
6262
6263 Function *Fn = getOrCreateRuntimeFunctionPtr(
6264 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6265
6266 Builder.CreateCall(Fn, {});
6267
6268 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6269 return;
6270
6271 Function *Kernel = Builder.GetInsertBlock()->getParent();
6272 // We need to strip the debug prefix to get the correct kernel name.
6273 StringRef KernelName = Kernel->getName();
6274 const std::string DebugPrefix = "_debug__";
6275 if (KernelName.ends_with(DebugPrefix))
6276 KernelName = KernelName.drop_back(DebugPrefix.length());
6277 auto *KernelEnvironmentGV =
6278 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6279 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6280 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6281 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6282 KernelEnvironmentInitializer,
6283 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6284 NewInitializer = ConstantFoldInsertValueInstruction(
6285 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6286 {0, 8});
6287 KernelEnvironmentGV->setInitializer(NewInitializer);
6288}
6289
6290static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6291 Module &M = *Kernel.getParent();
6292 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6293 for (auto *Op : MD->operands()) {
6294 if (Op->getNumOperands() != 3)
6295 continue;
6296 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6297 if (!KernelOp || KernelOp->getValue() != &Kernel)
6298 continue;
6299 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6300 if (!Prop || Prop->getString() != Name)
6301 continue;
6302 return Op;
6303 }
6304 return nullptr;
6305}
6306
6307static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6308 bool Min) {
6309 // Update the "maxntidx" metadata for NVIDIA, or add it.
6310 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6311 if (ExistingOp) {
6312 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6313 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6314 ExistingOp->replaceOperandWith(
6315 2, ConstantAsMetadata::get(ConstantInt::get(
6316 OldVal->getValue()->getType(),
6317 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6318 } else {
6319 LLVMContext &Ctx = Kernel.getContext();
6320 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6321 MDString::get(Ctx, Name),
6322 ConstantAsMetadata::get(
6323 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6324 // Append metadata to nvvm.annotations
6325 Module &M = *Kernel.getParent();
6326 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6327 MD->addOperand(MDNode::get(Ctx, MDVals));
6328 }
6329}
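// The resulting module-level metadata looks roughly like:
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}
// where, if an entry already existed, the integer is the min (or max,
// depending on the Min flag) of the old and new limits.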
6330
6331std::pair<int32_t, int32_t>
6332OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6333 int32_t ThreadLimit =
6334 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6335
6336 if (T.isAMDGPU()) {
6337 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6338 if (!Attr.isValid() || !Attr.isStringAttribute())
6339 return {0, ThreadLimit};
6340 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6341 int32_t LB, UB;
6342 if (!llvm::to_integer(UBStr, UB, 10))
6343 return {0, ThreadLimit};
6344 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6345 if (!llvm::to_integer(LBStr, LB, 10))
6346 return {0, UB};
6347 return {LB, UB};
6348 }
6349
6350 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6351 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6352 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6353 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6354 }
6355 return {0, ThreadLimit};
6356}
6357
6358void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6359 Function &Kernel, int32_t LB,
6360 int32_t UB) {
6361 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6362
6363 if (T.isAMDGPU()) {
6364 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6365 llvm::utostr(LB) + "," + llvm::utostr(UB));
6366 return;
6367 }
6368
6369 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6370}
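// E.g. writeThreadBoundsForKernel(T, K, 1, 256) results in (illustrative):
//   attributes: "omp_target_thread_limit"="256" (all targets)
//               "amdgpu-flat-work-group-size"="1,256" (AMDGPU)
//   metadata:   !{ptr @K, !"maxntidx", i32 256} (NVPTX)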
6371
6372std::pair<int32_t, int32_t>
6373OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6374 // TODO: Read from backend annotations if available.
6375 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6376}
6377
6378void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6379 int32_t LB, int32_t UB) {
6380 if (T.isNVPTX())
6381 if (UB > 0)
6382 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6383 if (T.isAMDGPU())
6384 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6385
6386 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6387}
6388
6389void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6390 Function *OutlinedFn) {
6391 if (Config.isTargetDevice()) {
6392 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6393 // TODO: Determine if DSO local can be set to true.
6394 OutlinedFn->setDSOLocal(false);
6395 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6396 if (T.isAMDGCN())
6397 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6398 }
6399}
6400
6401Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6402 StringRef EntryFnIDName) {
6403 if (Config.isTargetDevice()) {
6404 assert(OutlinedFn && "The outlined function must exist if embedded");
6405 return OutlinedFn;
6406 }
6407
6408 return new GlobalVariable(
6409 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6410 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6411}
6412
6413Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6414 StringRef EntryFnName) {
6415 if (OutlinedFn)
6416 return OutlinedFn;
6417
6418 assert(!M.getGlobalVariable(EntryFnName, true) &&
6419 "Named kernel already exists?");
6420 return new GlobalVariable(
6421 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6422 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6423}
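// On the host, when no outlined function exists, both symbols end up as
// zero-initialized i8 globals that the offload entry table points at, e.g.
// (illustrative name):
//   @.__omp_offloading_fd00_1_foo_l10.region_id = weak constant i8 0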
6424
6425Error OpenMPIRBuilder::emitTargetRegionFunction(
6426 TargetRegionEntryInfo &EntryInfo,
6427 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6428 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6429
6430 SmallString<64> EntryFnName;
6431 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6432
6433 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
6434 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6435 if (!CBResult)
6436 return CBResult.takeError();
6437 OutlinedFn = *CBResult;
6438 } else {
6439 OutlinedFn = nullptr;
6440 }
6441
6442 // If this target outline function is not an offload entry, we don't need to
6443 // register it. This may be in the case of a false if clause, or if there are
6444 // no OpenMP targets.
6445 if (!IsOffloadEntry)
6446 return Error::success();
6447
6448 std::string EntryFnIDName =
6449 Config.isTargetDevice()
6450 ? std::string(EntryFnName)
6451 : createPlatformSpecificName({EntryFnName, "region_id"});
6452
6453 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6454 EntryFnName, EntryFnIDName);
6455 return Error::success();
6456}
6457
6458Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6459 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6460 StringRef EntryFnName, StringRef EntryFnIDName) {
6461 if (OutlinedFn)
6462 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6463 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6464 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6465 OffloadInfoManager.registerTargetRegionEntryInfo(
6466 EntryInfo, EntryAddr, OutlinedFnID,
6467 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6468 return OutlinedFnID;
6469}
6470
6471OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
6472 const LocationDescription &Loc, InsertPointTy AllocaIP,
6473 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6474 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6475 omp::RuntimeFunction *MapperFunc,
6476 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
6477 BodyGenTy BodyGenType)>
6478 BodyGenCB,
6479 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6480 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6481 if (!updateToLocation(Loc))
6482 return InsertPointTy();
6483
6484 Builder.restoreIP(CodeGenIP);
6485 // Disable TargetData CodeGen on Device pass.
6486 if (Config.IsTargetDevice.value_or(false)) {
6487 if (BodyGenCB) {
6488 InsertPointOrErrorTy AfterIP =
6489 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6490 if (!AfterIP)
6491 return AfterIP.takeError();
6492 Builder.restoreIP(*AfterIP);
6493 }
6494 return Builder.saveIP();
6495 }
6496
6497 bool IsStandAlone = !BodyGenCB;
6498 MapInfosTy *MapInfo;
6499 // Generate the code for the opening of the data environment. Capture all the
6500 // arguments of the runtime call by reference because they are used in the
6501 // closing of the region.
6502 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6503 InsertPointTy CodeGenIP) -> Error {
6504 MapInfo = &GenMapInfoCB(Builder.saveIP());
6505 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6506 /*IsNonContiguous=*/true, DeviceAddrCB,
6507 CustomMapperCB);
6508
6509 TargetDataRTArgs RTArgs;
6510 emitOffloadingArraysArgument(Builder, RTArgs, Info);
6511
6512 // Emit the number of elements in the offloading arrays.
6513 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6514
6515 // Source location for the ident struct
6516 if (!SrcLocInfo) {
6517 uint32_t SrcLocStrSize;
6518 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6519 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6520 }
6521
6522 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6523 SrcLocInfo, DeviceID,
6524 PointerNum, RTArgs.BasePointersArray,
6525 RTArgs.PointersArray, RTArgs.SizesArray,
6526 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6527 RTArgs.MappersArray};
6528
6529 if (IsStandAlone) {
6530 assert(MapperFunc && "MapperFunc missing for standalone target data");
6531
6532 auto TaskBodyCB = [&](Value *, Value *,
6533 IRBuilderBase::InsertPoint) -> Error {
6534 if (Info.HasNoWait) {
6535 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6536 llvm::Constant::getNullValue(VoidPtr),
6537 llvm::Constant::getNullValue(Int32),
6538 llvm::Constant::getNullValue(VoidPtr)});
6539 }
6540
6541 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6542 OffloadingArgs);
6543
6544 if (Info.HasNoWait) {
6545 BasicBlock *OffloadContBlock =
6546 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6547 Function *CurFn = Builder.GetInsertBlock()->getParent();
6548 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6550 }
6551 return Error::success();
6552 };
6553
6554 bool RequiresOuterTargetTask = Info.HasNoWait;
6555 if (!RequiresOuterTargetTask)
6556 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6557 /*TargetTaskAllocaIP=*/{}));
6558 else
6559 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6560 /*Dependencies=*/{}, Info.HasNoWait));
6561 } else {
6562 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6563 omp::OMPRTL___tgt_target_data_begin_mapper);
6564
6565 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6566
6567 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6568 if (isa<AllocaInst>(DeviceMap.second.second)) {
6569 auto *LI =
6570 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6571 Builder.CreateStore(LI, DeviceMap.second.second);
6572 }
6573 }
6574
6575 // If device pointer privatization is required, emit the body of the
6576 // region here. It will have to be duplicated: with and without
6577 // privatization.
6578 InsertPointOrErrorTy AfterIP =
6579 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6580 if (!AfterIP)
6581 return AfterIP.takeError();
6582 Builder.restoreIP(*AfterIP);
6583 }
6584 return Error::success();
6585 };
6586
6587 // If we need device pointer privatization, we need to emit the body of the
6588 // region with no privatization in the 'else' branch of the conditional.
6589 // Otherwise, we don't have to do anything.
6590 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6591 InsertPointTy CodeGenIP) -> Error {
6592 InsertPointOrErrorTy AfterIP =
6593 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6594 if (!AfterIP)
6595 return AfterIP.takeError();
6596 Builder.restoreIP(*AfterIP);
6597 return Error::success();
6598 };
6599
6600 // Generate code for the closing of the data region.
6601 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6602 TargetDataRTArgs RTArgs;
6603 Info.EmitDebug = !MapInfo->Names.empty();
6604 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6605
6606 // Emit the number of elements in the offloading arrays.
6607 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6608
6609 // Source location for the ident struct
6610 if (!SrcLocInfo) {
6611 uint32_t SrcLocStrSize;
6612 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6613 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6614 }
6615
6616 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6617 PointerNum, RTArgs.BasePointersArray,
6618 RTArgs.PointersArray, RTArgs.SizesArray,
6619 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6620 RTArgs.MappersArray};
6621 Function *EndMapperFunc =
6622 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6623
6624 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6625 return Error::success();
6626 };
6627
6628 // We don't have to do anything to close the region if the if clause evaluates
6629 // to false.
6630 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6631 return Error::success();
6632 };
6633
6634 Error Err = [&]() -> Error {
6635 if (BodyGenCB) {
6636 Error Err = [&]() {
6637 if (IfCond)
6638 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6639 return BeginThenGen(AllocaIP, Builder.saveIP());
6640 }();
6641
6642 if (Err)
6643 return Err;
6644
6645 // If we don't require privatization of device pointers, we emit the body
6646 // in between the runtime calls. This avoids duplicating the body code.
6647 InsertPointOrErrorTy AfterIP =
6648 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6649 if (!AfterIP)
6650 return AfterIP.takeError();
6651 Builder.restoreIP(*AfterIP);
6652
6653 if (IfCond)
6654 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6655 return EndThenGen(AllocaIP, Builder.saveIP());
6656 }
6657 if (IfCond)
6658 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6659 return BeginThenGen(AllocaIP, Builder.saveIP());
6660 }();
6661
6662 if (Err)
6663 return Err;
6664
6665 return Builder.saveIP();
6666}
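// For a non-standalone `omp target data` region the emitted skeleton is
// roughly:
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device_id,
//       i32 %n, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %mapnames, ptr %mappers)
//   ;; ... region body ...
//   call void @__tgt_target_data_end_mapper(<same argument shape>)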
6667
6668FunctionCallee
6669OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
6670 bool IsGPUDistribute) {
6671 assert((IVSize == 32 || IVSize == 64) &&
6672 "IV size is not compatible with the omp runtime");
6673 RuntimeFunction Name;
6674 if (IsGPUDistribute)
6675 Name = IVSize == 32
6676 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6677 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6678 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6679 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6680 else
6681 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6682 : omp::OMPRTL___kmpc_for_static_init_4u)
6683 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6684 : omp::OMPRTL___kmpc_for_static_init_8u);
6685
6686 return getOrCreateRuntimeFunction(M, Name);
6687}
6688
6689FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6690 bool IVSigned) {
6691 assert((IVSize == 32 || IVSize == 64) &&
6692 "IV size is not compatible with the omp runtime");
6693 RuntimeFunction Name = IVSize == 32
6694 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6695 : omp::OMPRTL___kmpc_dispatch_init_4u)
6696 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6697 : omp::OMPRTL___kmpc_dispatch_init_8u);
6698
6699 return getOrCreateRuntimeFunction(M, Name);
6700}
6701
6702FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6703 bool IVSigned) {
6704 assert((IVSize == 32 || IVSize == 64) &&
6705 "IV size is not compatible with the omp runtime");
6706 RuntimeFunction Name = IVSize == 32
6707 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6708 : omp::OMPRTL___kmpc_dispatch_next_4u)
6709 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6710 : omp::OMPRTL___kmpc_dispatch_next_8u);
6711
6712 return getOrCreateRuntimeFunction(M, Name);
6713}
6714
6715FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6716 bool IVSigned) {
6717 assert((IVSize == 32 || IVSize == 64) &&
6718 "IV size is not compatible with the omp runtime");
6719 RuntimeFunction Name = IVSize == 32
6720 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6721 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6722 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6723 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6724
6725 return getOrCreateRuntimeFunction(M, Name);
6726}
6727
6728FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
6729 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6730}
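// Worked example of the selection logic above: a signed 32-bit induction
// variable yields __kmpc_dispatch_init_4 / __kmpc_dispatch_next_4 /
// __kmpc_dispatch_fini_4, while an unsigned 64-bit one yields the
// corresponding _8u variants.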
6731
6732static Expected<Function *> createOutlinedFunction(
6733 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6734 SmallVectorImpl<Value *> &Inputs,
6735 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6736 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6737 SmallVector<Type *> ParameterTypes;
6738 if (OMPBuilder.Config.isTargetDevice()) {
6739 // Add the "implicit" runtime argument we use to provide launch specific
6740 // information for target devices.
6741 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6742 ParameterTypes.push_back(Int8PtrTy);
6743
6744 // All parameters to target devices are passed as pointers
6745 // or i64. This assumes 64-bit address spaces/pointers.
6746 for (auto &Arg : Inputs)
6747 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6748 ? Arg->getType()
6749 : Type::getInt64Ty(Builder.getContext()));
6750 } else {
6751 for (auto &Arg : Inputs)
6752 ParameterTypes.push_back(Arg->getType());
6753 }
6754
6755 auto BB = Builder.GetInsertBlock();
6756 auto M = BB->getModule();
6757 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6758 /*isVarArg*/ false);
6759 auto Func =
6760 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6761
6762 // Save insert point.
6763 IRBuilder<>::InsertPointGuard IPG(Builder);
6764 // If there's a DISubprogram associated with current function, then
6765 // generate one for the outlined function.
6766 if (Function *ParentFunc = BB->getParent()) {
6767 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6768 DICompileUnit *CU = SP->getUnit();
6769 DIBuilder DB(*M, true, CU);
6770 DebugLoc DL = Builder.getCurrentDebugLocation();
6771 if (DL) {
6772 // TODO: We are using nullopt for arguments at the moment. This will
6773 // need to be updated when debug data is being generated for variables.
6774 DISubroutineType *Ty =
6775 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6776 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6777 DISubprogram::SPFlagOptimized |
6778 DISubprogram::SPFlagLocalToUnit;
6779
6780 DISubprogram *OutlinedSP = DB.createFunction(
6781 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6782 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6783
6784 // Attach subprogram to the function.
6785 Func->setSubprogram(OutlinedSP);
6786 // Update the CurrentDebugLocation in the builder so that right scope
6787 // is used for things inside outlined function.
6788 Builder.SetCurrentDebugLocation(
6789 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6790 OutlinedSP, DL.getInlinedAt()));
6791 }
6792 }
6793 }
6794
6795 // Generate the region into the function.
6796 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6797 Builder.SetInsertPoint(EntryBB);
6798
6799 // Insert target init call in the device compilation pass.
6800 if (OMPBuilder.Config.isTargetDevice())
6801 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6802
6803 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6804
6805 // As we embed the user code in the middle of our target region after we
6806 // generate entry code, we must move what allocas we can into the entry
6807 // block to avoid possibly breaking optimisations for the device.
6808 if (OMPBuilder.Config.isTargetDevice())
6809 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6810
6811 // Insert target deinit call in the device compilation pass.
6812 BasicBlock *OutlinedBodyBB =
6813 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6814 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
6815 Builder.saveIP(),
6816 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6817 if (!AfterIP)
6818 return AfterIP.takeError();
6819 Builder.restoreIP(*AfterIP);
6820 if (OMPBuilder.Config.isTargetDevice())
6821 OMPBuilder.createTargetDeinit(Builder);
6822
6823 // Insert return instruction.
6824 Builder.CreateRetVoid();
6825
6826 // New Alloca IP at entry point of created device function.
6827 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6828 auto AllocaIP = Builder.saveIP();
6829
6830 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6831
6832 // Skip the artificial dyn_ptr on the device.
6833 const auto &ArgRange =
6834 OMPBuilder.Config.isTargetDevice()
6835 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6836 : Func->args();
6837
6838 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6839 // Things like GEP's can come in the form of Constants. Constants and
6840 // ConstantExpr's do not have access to the knowledge of what they're
6841 // contained in, so we must dig a little to find an instruction so we
6842 // can tell if they're used inside of the function we're outlining. We
6843 // also replace the original constant expression with an equivalent
6844 // instruction, since an instruction allows easy modification in the
6845 // following loop: we then know the constant (now an instruction) is
6846 // owned by our target function, so replaceUsesOfWith can be invoked
6847 // on it (this cannot be done with constants). A brand new one also
6848 // allows us to be cautious, as it is possible the old expression
6849 // was used inside of the function but also exists and is used externally
6850 // (unlikely by the nature of a Constant, but still possible).
6851 // NOTE: We cannot remove dead constants that have been rewritten to
6852 // instructions at this stage, we run the risk of breaking later lowering
6853 // by doing so as we could still be in the process of lowering the module
6854 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6855 // constants we have created rewritten versions of.
6856 if (auto *Const = dyn_cast<Constant>(Input))
6857 convertUsersOfConstantsToInstructions(Const, Func, false);
6858
6859 // Replace uses of Input within the outlined function.
6860 for (User *User : make_early_inc_range(Input->users()))
6861 if (auto *Instr = dyn_cast<Instruction>(User))
6862 if (Instr->getFunction() == Func)
6863 Instr->replaceUsesOfWith(Input, InputCopy);
6864 };
6865
6866 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6867
6868 // Rewrite uses of input values to parameters.
6869 for (auto InArg : zip(Inputs, ArgRange)) {
6870 Value *Input = std::get<0>(InArg);
6871 Argument &Arg = std::get<1>(InArg);
6872 Value *InputCopy = nullptr;
6873
6874 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
6875 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6876 if (!AfterIP)
6877 return AfterIP.takeError();
6878 Builder.restoreIP(*AfterIP);
6879
6880 // In certain cases a Global may be set up for replacement; however, this
6881 // Global may be used in multiple arguments to the kernel, just segmented
6882 // apart. For example, if we have a global array that is sectioned into
6883 // multiple mappings (technically not legal in OpenMP, but there is a case
6884 // in Fortran for Common Blocks where this is necessary), we will end up
6885 // with GEPs into this array inside the kernel that refer to the Global
6886 // but are technically separate arguments to the kernel for all intents and
6887 // purposes. If we have mapped a segment that requires a GEP into the 0-th
6888 // index, it will fold into a reference to the Global. If we then encounter
6889 // this folded GEP during replacement, all of the references to the
6890 // Global in the kernel will be replaced with the argument we have generated
6891 // that corresponds to it, including any other GEPs that refer to the
6892 // Global and that may be other arguments. This would invalidate all of the
6893 // other preceding mapped arguments that refer to the same global but are
6894 // separate segments. To prevent this, we defer global processing until all
6895 // other processing has been performed.
6896 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6897 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6898 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6899 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6900 continue;
6901 }
6902
6903 ReplaceValue(Input, InputCopy, Func);
6904 }
6905
6906 // Replace all of our deferred Input values, currently just Globals.
6907 for (auto Deferred : DeferredReplacement)
6908 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6909
6910 return Func;
6911}
6912
6913/// Create an entry point for a target task. It will have the
6914/// following signature:
6915/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6916/// This function is called from emitTargetTask once the
6917/// code to launch the target kernel has been outlined already.
6918static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6919 IRBuilderBase &Builder,
6920 CallInst *StaleCI) {
6921 Module &M = OMPBuilder.M;
6922 // KernelLaunchFunction is the target launch function, i.e.
6923 // the function that sets up kernel arguments and calls
6924 // __tgt_target_kernel to launch the kernel on the device.
6925 //
6926 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6927
6928 // StaleCI is the CallInst which is the call to the outlined
6929 // target kernel launch function. If there are values that the
6930 // outlined function uses then these are aggregated into a structure
6931 // which is passed as the second argument. If not, then there's
6932 // only one argument, the threadID. So, StaleCI can be
6933 //
6934 // %structArg = alloca { ptr, ptr }, align 8
6935 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6936 // store ptr %20, ptr %gep_, align 8
6937 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6938 // store ptr %21, ptr %gep_8, align 8
6939 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6940 //
6941 // OR
6942 //
6943 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6944 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
6945 StaleCI->getIterator());
6946 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6947 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6948 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6949 Type *TaskTy = OMPBuilder.Task;
6950 auto ProxyFnTy =
6951 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6952 /* isVarArg */ false);
6953 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6954 ".omp_target_task_proxy_func",
6955 Builder.GetInsertBlock()->getModule());
6956 ProxyFn->getArg(0)->setName("thread.id");
6957 ProxyFn->getArg(1)->setName("task");
6958
6959 BasicBlock *EntryBB =
6960 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6961 Builder.SetInsertPoint(EntryBB);
6962
6963 bool HasShareds = StaleCI->arg_size() > 1;
6964 // TODO: This is a temporary assert to prove to ourselves that
6965 // the outlined target launch function is always going to have
6966 // at most two arguments if there is any data shared between
6967 // host and device.
6968 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6969 "StaleCI with shareds should have exactly two arguments.");
6970 if (HasShareds) {
6971 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6972 assert(ArgStructAlloca &&
6973 "Unable to find the alloca instruction corresponding to arguments "
6974 "for extracted function");
6975 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
6976
6977 AllocaInst *NewArgStructAlloca =
6978 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6979 Value *TaskT = ProxyFn->getArg(1);
6980 Value *ThreadId = ProxyFn->getArg(0);
6981 Value *SharedsSize =
6982 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6983
6984 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6985 LoadInst *LoadShared =
6986 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
6987
6988 Builder.CreateMemCpy(
6989 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
6990 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
6991
6992 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
6993 }
6994 Builder.CreateRetVoid();
6995 return ProxyFn;
6996}
6997
6998static Error emitTargetOutlinedFunction(
6999 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7000 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
7001 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
7002 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7003 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7004
7005 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7006 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
7007 &ArgAccessorFuncCB](StringRef EntryFnName) {
7008 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
7009 CBFunc, ArgAccessorFuncCB);
7010 };
7011
7012 return OMPBuilder.emitTargetRegionFunction(
7013 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7014 OutlinedFnID);
7015}
7016
7017OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7018 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7019 OpenMPIRBuilder::InsertPointTy AllocaIP,
7020 SmallVector<OpenMPIRBuilder::DependData> Dependencies,
7021 bool HasNoWait) {
7022
7023 // The following explains the code-gen scenario for the `target` directive. A
7024 // similar scenario is followed for other device-related directives (e.g.
7025 // `target enter data`), since we only need to emit a task
7026 // that encapsulates the proper runtime call.
7027 //
7028 // When we arrive at this function, the target region itself has been
7029 // outlined into the function OutlinedFn.
7030 // So at this point, for
7031 // --------------------------------------------------
7032 // void user_code_that_offloads(...) {
7033 // omp target depend(..) map(from:a) map(to:b, c)
7034 // a = b + c
7035 // }
7036 //
7037 // --------------------------------------------------
7038 //
7039 // we have
7040 //
7041 // --------------------------------------------------
7042 //
7043 // void user_code_that_offloads(...) {
7044 // %.offload_baseptrs = alloca [3 x ptr], align 8
7045 // %.offload_ptrs = alloca [3 x ptr], align 8
7046 // %.offload_mappers = alloca [3 x ptr], align 8
7047 // ;; target region has been outlined and now we need to
7048 // ;; offload to it via a target task.
7049 // }
7050 // void outlined_device_function(ptr a, ptr b, ptr c) {
7051 // *a = *b + *c
7052 // }
7053 //
7054 // We have to now do the following
7055 // (i) Make an offloading call to outlined_device_function using the OpenMP
7056 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7057 // emitted by emitKernelLaunch
7058 // (ii) Create a task entry point function that calls kernel_launch_function
7059 // and is the entry point for the target task. See
7060 // '@.omp_target_task_proxy_func in the pseudocode below.
7061 // (iii) Create a task with the task entry point created in (ii)
7062 //
7063 // That is we create the following
7064 //
7065 // void user_code_that_offloads(...) {
7066 // %.offload_baseptrs = alloca [3 x ptr], align 8
7067 // %.offload_ptrs = alloca [3 x ptr], align 8
7068 // %.offload_mappers = alloca [3 x ptr], align 8
7069 //
7070 // %structArg = alloca { ptr, ptr, ptr }, align 8
7071 // %structArg[0] = %.offload_baseptrs
7072 // %structArg[1] = %.offload_ptrs
7073 // %structArg[2] = %.offload_mappers
7074 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7075 // @.omp_target_task_proxy_func)
7076 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7077 // dependencies_array = ...
7078 // ;; if nowait not present
7079 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7080 // call @__kmpc_omp_task_begin_if0(...)
7081 // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
7082 // call @__kmpc_omp_task_complete_if0(...)
7083 // }
7084 //
7085 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7086 // ptr %task) {
7087 // %structArg = alloca {ptr, ptr, ptr}
7088 // %shared_data = load (getelementptr %task, 0, 0)
7089 // memcpy(%structArg, %shared_data, sizeof(structArg))
7090 // kernel_launch_function(%thread.id, %structArg)
7091 // }
7092 //
7093 // We need the proxy function because the signature of the task entry point
7094 // expected by kmpc_omp_task is always the same and will be different from
7095 // that of the kernel_launch function.
7096 //
7097 // kernel_launch_function is generated by emitKernelLaunch and has the
7098 // always_inline attribute.
7099 // void kernel_launch_function(thread_id,
7100 // structArg) alwaysinline {
7101 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7102 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7103 // offload_ptrs = load(getelementptr structArg, 0, 1)
7104 // offload_mappers = load(getelementptr structArg, 0, 2)
7105 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7106 // ; offload_mappers
7107 // call i32 @__tgt_target_kernel(...,
7108 // outlined_device_function,
7109 // ptr %kernel_args)
7110 // }
7111 // void outlined_device_function(ptr a, ptr b, ptr c) {
7112 // *a = *b + *c
7113 // }
7114 //
7115 BasicBlock *TargetTaskBodyBB =
7116 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7117 BasicBlock *TargetTaskAllocaBB =
7118 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7119
7120 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7121 TargetTaskAllocaBB->begin());
7122 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7123
7124 OutlineInfo OI;
7125 OI.EntryBB = TargetTaskAllocaBB;
7126 OI.OuterAllocaBB = AllocaIP.getBlock();
7127
7128 // Add the thread ID argument.
7129 SmallVector<Instruction *, 4> ToBeDeleted;
7130 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7131 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7132
7133 Builder.restoreIP(TargetTaskBodyIP);
7134
7135 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7136 return Err;
7137
7138 OI.ExitBB = Builder.saveIP().getBlock();
7139 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7140 DeviceID](Function &OutlinedFn) mutable {
7141 assert(OutlinedFn.getNumUses() == 1 &&
7142 "there must be a single user for the outlined function");
7143
7144 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7145 bool HasShareds = StaleCI->arg_size() > 1;
7146
7147 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7148
7149 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7150 << "\n");
7151
7152 Builder.SetInsertPoint(StaleCI);
7153
7154 // Gather the arguments for emitting the runtime call.
7155 uint32_t SrcLocStrSize;
7156 Constant *SrcLocStr =
7157 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
7158 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7159
7160 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7161 //
7162 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7163 // the DeviceID to the deferred task, and also because
7164 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7165 Function *TaskAllocFn =
7166 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7167 : getOrCreateRuntimeFunctionPtr(
7168 OMPRTL___kmpc_omp_target_task_alloc);
7169
7170 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7171 // call.
7172 Value *ThreadID = getOrCreateThreadID(Ident);
7173
7174 // Argument - `sizeof_kmp_task_t` (TaskSize)
7175 // Tasksize refers to the size in bytes of kmp_task_t data structure
7176 // including private vars accessed in task.
7177 // TODO: add kmp_task_t_with_privates (privates)
7178 Value *TaskSize =
7179 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
7180
7181 // Argument - `sizeof_shareds` (SharedsSize)
7182 // SharedsSize refers to the shareds array size in the kmp_task_t data
7183 // structure.
7184 Value *SharedsSize = Builder.getInt64(0);
7185 if (HasShareds) {
7186 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7187 assert(ArgStructAlloca &&
7188 "Unable to find the alloca instruction corresponding to arguments "
7189 "for extracted function");
7190 auto *ArgStructType =
7191 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7192 assert(ArgStructType && "Unable to find struct type corresponding to "
7193 "arguments for extracted function");
7194 SharedsSize =
7195 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7196 }
7197
7198 // Argument - `flags`
7199 // Task is tied iff (Flags & 1) == 1.
7200 // Task is untied iff (Flags & 1) == 0.
7201 // Task is final iff (Flags & 2) == 2.
7202 // Task is not final iff (Flags & 2) == 0.
7203 // A target task is not final and is untied.
7204 Value *Flags = Builder.getInt32(0);
7205
7206 // Emit the @__kmpc_omp_task_alloc runtime call
7207 // The runtime call returns a pointer to an area where the task captured
7208 // variables must be copied before the task is run (TaskData)
7209 CallInst *TaskData = nullptr;
7210
7211 SmallVector<llvm::Value *> TaskAllocArgs = {
7212 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7213 /*flags=*/Flags,
7214 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7215 /*task_func=*/ProxyFn};
7216
7217 if (HasNoWait)
7218 TaskAllocArgs.push_back(DeviceID);
7219
7220 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7221
7222 if (HasShareds) {
7223 Value *Shareds = StaleCI->getArgOperand(1);
7224 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7225 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7226 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7227 SharedsSize);
7228 }
7229
7230 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7231
7232 // ---------------------------------------------------------------
7233 // V5.2 13.8 target construct
7234 // If the nowait clause is present, execution of the target task
7235 // may be deferred. If the nowait clause is not present, the target task is
7236 // an included task.
7237 // ---------------------------------------------------------------
7238 // The above means that the lack of a nowait on the target construct
7239 // translates to '#pragma omp task if(0)'
7240 if (!HasNoWait) {
7241 if (DepArray) {
7242 Function *TaskWaitFn =
7243 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7244 Builder.CreateCall(
7245 TaskWaitFn,
7246 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7247 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7248 /*dep_list=*/DepArray,
7249 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7250 /*noalias_dep_list=*/
7251 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7252 }
7253 // Included task.
7254 Function *TaskBeginFn =
7255 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7256 Function *TaskCompleteFn =
7257 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7258 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7259 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7260 CI->setDebugLoc(StaleCI->getDebugLoc());
7261 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7262 } else if (DepArray) {
7263 // HasNoWait - meaning the task may be deferred. Call
7264 // __kmpc_omp_task_with_deps if there are dependencies,
7265 // else call __kmpc_omp_task
7266 Function *TaskFn =
7267 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7268 Builder.CreateCall(
7269 TaskFn,
7270 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7271 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7272 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7273 } else {
7274 // Emit the @__kmpc_omp_task runtime call to spawn the task
7275 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7276 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7277 }
7278
7279 StaleCI->eraseFromParent();
7280 for (Instruction *I : llvm::reverse(ToBeDeleted))
7281 I->eraseFromParent();
7282 };
7283 addOutlineInfo(std::move(OI));
7284
7285 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7286 << *(Builder.GetInsertBlock()) << "\n");
7287 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7288 << M
7289 << "\n");
7290 return Builder.saveIP();
7291}
7292
7293void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7294 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7295 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7296 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7297 function_ref<Value *(unsigned int)> CustomMapperCB) {
7298 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7299 DeviceAddrCB, CustomMapperCB);
7300 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7301}
7302
7303static void
7304emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7305 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7306 Constant *OutlinedFnID, ArrayRef<int32_t> NumTeams,
7307 ArrayRef<int32_t> NumThreads, SmallVectorImpl<Value *> &Args,
7308 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7309 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {},
7310 bool HasNoWait = false) {
7311 // Generate a function call to the host fallback implementation of the target
7312 // region. This is called by the host when no offload entry was generated for
7313 // the target region and when the offloading call fails at runtime.
7314 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7315 -> OpenMPIRBuilder::InsertPointOrErrorTy {
7316 Builder.restoreIP(IP);
7317 Builder.CreateCall(OutlinedFn, Args);
7318 return Builder.saveIP();
7319 };
7320
7321 bool HasDependencies = Dependencies.size() > 0;
7322 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7323
7324 OpenMPIRBuilder::TargetKernelArgs KArgs;
7325
7326 auto TaskBodyCB =
7327 [&](Value *DeviceID, Value *RTLoc,
7328 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7329 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = [&]() {
7330 // emitKernelLaunch makes the necessary runtime call to offload the
7331 // kernel. We then outline all that code into a separate function
7332 // ('kernel_launch_function' in the pseudo code above). This function is
7333 // then called by the target task proxy function (see
7334 // '@.omp_target_task_proxy_func' in the pseudo code above)
7335 // "@.omp_target_task_proxy_func' is generated by
7336 // '@.omp_target_task_proxy_func' is generated by
7337 if (OutlinedFnID)
7338 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7339 EmitTargetCallFallbackCB, KArgs,
7340 DeviceID, RTLoc, TargetTaskAllocaIP);
7341 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7342 // In this case, we execute the host implementation directly.
7343 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7344 }();
7345
7346 if (!AfterIP)
7347 return AfterIP.takeError();
7348
7349 OMPBuilder.Builder.restoreIP(*AfterIP);
7350 return Error::success();
7351 };
7352
7353 // If we don't have an ID for the target region, it means an offload entry
7354 // wasn't created. In this case we just run the host fallback directly.
7355 if (!OutlinedFnID) {
7356 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = [&]() {
7357 if (RequiresOuterTargetTask) {
7358 // Arguments that are intended to be directly forwarded to an
7359 // emitKernelLaunch call are passed as nullptr, since
7360 // OutlinedFnID=nullptr results in that call not being done.
7361 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7362 /*RTLoc=*/nullptr, AllocaIP,
7363 Dependencies, HasNoWait);
7364 }
7365 return EmitTargetCallFallbackCB(Builder.saveIP());
7366 }();
7367
7368 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7369 // produce any. The 'if' check enables accessing the returned value.
7370 if (AfterIP)
7371 Builder.restoreIP(*AfterIP);
7372 return;
7373 }
7374
7375 OpenMPIRBuilder::TargetDataInfo Info(
7376 /*RequiresDevicePointerInfo=*/false,
7377 /*SeparateBeginEndCalls=*/true);
7378
7379 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7380 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7381 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7382 RTArgs, MapInfo,
7383 /*IsNonContiguous=*/true,
7384 /*ForEndCall=*/false);
7385
7386 SmallVector<Value *, 3> NumTeamsC;
7387 SmallVector<Value *, 3> NumThreadsC;
7388 for (auto V : NumTeams)
7389 NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7390 for (auto V : NumThreads)
7391 NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7392
7393 unsigned NumTargetItems = Info.NumberOfPtrs;
7394 // TODO: Use correct device ID
7395 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7396 uint32_t SrcLocStrSize;
7397 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7398 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7399 llvm::omp::IdentFlag(0), 0);
7400 // TODO: Use correct NumIterations
7401 Value *NumIterations = Builder.getInt64(0);
7402 // TODO: Use correct DynCGGroupMem
7403 Value *DynCGGroupMem = Builder.getInt32(0);
7404
7405 KArgs = OpenMPIRBuilder::TargetKernelArgs(
7406 NumTargetItems, RTArgs, NumIterations, NumTeamsC, NumThreadsC,
7407 DynCGGroupMem, HasNoWait);
7408
7409 // The presence of certain clauses on the target directive requires the
7410 // explicit generation of the target task.
7411 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = [&]() {
7412 if (RequiresOuterTargetTask)
7413 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7414 Dependencies, HasNoWait);
7415
7416 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7417 EmitTargetCallFallbackCB, KArgs,
7418 DeviceID, RTLoc, AllocaIP);
7419 }();
7420
7421 // Assume no error was returned because TaskBodyCB and
7422 // EmitTargetCallFallbackCB don't produce any. The 'if' check enables
7423 // accessing the returned value.
7424 if (AfterIP)
7425 Builder.restoreIP(*AfterIP);
7426}
7427
7428OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
7429 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7430 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7431 ArrayRef<int32_t> NumTeams, ArrayRef<int32_t> NumThreads,
7432 SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7433 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7434 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7435 SmallVector<DependData> Dependencies, bool HasNowait) {
7436
7437 if (!updateToLocation(Loc))
7438 return InsertPointTy();
7439
7440 Builder.restoreIP(CodeGenIP);
7441
7442 Function *OutlinedFn;
7443 Constant *OutlinedFnID = nullptr;
7444 // The target region is outlined into its own function. The LLVM IR for
7445 // the target region itself is generated using the callbacks CBFunc
7446 // and ArgAccessorFuncCB
7447 if (Error Err = emitTargetOutlinedFunction(
7448 *this, Builder, IsOffloadEntry, EntryInfo, OutlinedFn, OutlinedFnID,
7449 Args, CBFunc, ArgAccessorFuncCB))
7450 return Err;
7451
7452 // If we are not on the target device, then we need to generate code
7453 // to make a remote call (offload) to the previously outlined function
7454 // that represents the target region. Do that now.
7455 if (!Config.isTargetDevice())
7456 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7457 NumThreads, Args, GenMapInfoCB, Dependencies, HasNowait);
7458 return Builder.saveIP();
7459}
7460
7461std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7462 StringRef FirstSeparator,
7463 StringRef Separator) {
7464 SmallString<128> Buffer;
7465 llvm::raw_svector_ostream OS(Buffer);
7466 StringRef Sep = FirstSeparator;
7467 for (StringRef Part : Parts) {
7468 OS << Sep << Part;
7469 Sep = Separator;
7470 }
7471 return OS.str().str();
7472}
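// Worked example: getNameWithSeparators({"x", "y"}, "_", ".") produces
// "_x.y"; createPlatformSpecificName below feeds in Config.firstSeparator()
// and Config.separator() instead of fixed strings.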
7473
7474std::string
7475OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7476 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7477 Config.separator());
7478}
7479
7482 unsigned AddressSpace) {
7483 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7484 if (Elem.second) {
7485 assert(Elem.second->getValueType() == Ty &&
7486 "OMP internal variable has different type than requested");
7487 } else {
7488 // TODO: investigate the appropriate linkage type used for the global
7489 // variable for possibly changing that to internal or private, or maybe
7490 // create different versions of the function for different OMP internal
7491 // variables.
7492 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7493 ? GlobalValue::InternalLinkage
7494 : GlobalValue::CommonLinkage;
7495 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7496 Constant::getNullValue(Ty), Elem.first(),
7497 /*InsertBefore=*/nullptr,
7498 GlobalValue::NotThreadLocal, AddressSpace);
7499 const DataLayout &DL = M.getDataLayout();
7500 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7501 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7502 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7503 Elem.second = GV;
7504 }
7505
7506 return Elem.second;
7507}
7508
7509Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7510 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7511 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7512 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7513}
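// E.g. a critical region named "mylock" yields the internal variable
// ".gomp_critical_user_mylock.var" (given the "." separators passed above).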
7514
7515Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7516 LLVMContext &Ctx = Builder.getContext();
7517 Value *Null =
7518 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7519 Value *SizeGep =
7520 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7521 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7522 return SizePtrToInt;
7523}
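// This is the null-GEP "sizeof" idiom; as written it measures the store
// size of BasePtr's own type, emitting roughly:
//   %gep = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64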
7524
7525GlobalVariable *
7526OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7527 std::string VarName) {
7528 llvm::Constant *MaptypesArrayInit =
7529 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7530 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7531 M, MaptypesArrayInit->getType(),
7532 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7533 VarName);
7534 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7535 return MaptypesArrayGlobal;
7536}
7537
7538void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7539 InsertPointTy AllocaIP,
7540 unsigned NumOperands,
7541 struct MapperAllocas &MapperAllocas) {
7542 if (!updateToLocation(Loc))
7543 return;
7544
7545 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7546 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7547 Builder.restoreIP(AllocaIP);
7548 AllocaInst *ArgsBase = Builder.CreateAlloca(
7549 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7550 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7551 ".offload_ptrs");
7552 AllocaInst *ArgSizes = Builder.CreateAlloca(
7553 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7554 Builder.restoreIP(Loc.IP);
7555 MapperAllocas.ArgsBase = ArgsBase;
7556 MapperAllocas.Args = Args;
7557 MapperAllocas.ArgSizes = ArgSizes;
7558}
7559
7560void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7561 Function *MapperFunc, Value *SrcLocInfo,
7562 Value *MaptypesArg, Value *MapnamesArg,
7563 struct MapperAllocas &MapperAllocas,
7564 int64_t DeviceID, unsigned NumOperands) {
7565 if (!updateToLocation(Loc))
7566 return;
7567
7568 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7569 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7570 Value *ArgsBaseGEP =
7571 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7572 {Builder.getInt32(0), Builder.getInt32(0)});
7573 Value *ArgsGEP =
7574 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7575 {Builder.getInt32(0), Builder.getInt32(0)});
7576 Value *ArgSizesGEP =
7577 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7578 {Builder.getInt32(0), Builder.getInt32(0)});
7579 Value *NullPtr =
7580 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7581 Builder.CreateCall(MapperFunc,
7582 {SrcLocInfo, Builder.getInt64(DeviceID),
7583 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7584 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7585}
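// A typical expansion (sketch; the offload arrays are filled in by the
// caller, e.g. with MapperFunc = __tgt_target_data_begin_mapper):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 -1, i32 2,
//       ptr %.offload_baseptrs, ptr %.offload_ptrs, ptr %.offload_sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)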
7586
7587void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7588 TargetDataRTArgs &RTArgs,
7589 TargetDataInfo &Info,
7590 bool ForEndCall) {
7591 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7592 "expected region end call to runtime only when end call is separate");
7593 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7594 auto VoidPtrTy = UnqualPtrTy;
7595 auto VoidPtrPtrTy = UnqualPtrTy;
7596 auto Int64Ty = Type::getInt64Ty(M.getContext());
7597 auto Int64PtrTy = UnqualPtrTy;
7598
7599 if (!Info.NumberOfPtrs) {
7600 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7601 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7602 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7603 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7604 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7605 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7606 return;
7607 }
7608
7609 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7610 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7611 Info.RTArgs.BasePointersArray,
7612 /*Idx0=*/0, /*Idx1=*/0);
7613 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7614 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7615 /*Idx0=*/0,
7616 /*Idx1=*/0);
7617 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7618 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7619 /*Idx0=*/0, /*Idx1=*/0);
7620 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7621 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7622 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7623 : Info.RTArgs.MapTypesArray,
7624 /*Idx0=*/0,
7625 /*Idx1=*/0);
7626
7627 // Only emit the mapper information arrays if debug information is
7628 // requested.
7629 if (!Info.EmitDebug)
7630 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7631 else
7632 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7633 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7634 /*Idx0=*/0,
7635 /*Idx1=*/0);
7636 // If there is no user-defined mapper, set the mapper array to nullptr to
7637 // avoid an unnecessary data privatization
7638 if (!Info.HasMapper)
7639 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7640 else
7641 RTArgs.MappersArray =
7642 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7643}
7644
7645void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7646 InsertPointTy CodeGenIP,
7647 MapInfosTy &CombinedInfo,
7648 TargetDataInfo &Info) {
7649 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7650 CombinedInfo.NonContigInfo;
7651
7652 // Build an array of struct descriptor_dim and then assign it to
7653 // offload_args.
7654 //
7655 // struct descriptor_dim {
7656 // uint64_t offset;
7657 // uint64_t count;
7658 // uint64_t stride
7659 // };
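 // Each non-unit dimension of a non-contiguous mapping contributes one
 // descriptor_dim entry, with offset/count/stride expressed in element
 // units; note that the loop below fills the entries in reverse dimension
 // order (see RevIdx).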
7660 Type *Int64Ty = Builder.getInt64Ty();
7661 StructType *DimTy = StructType::create(
7662 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7663 "struct.descriptor_dim");
7664
7665 enum { OffsetFD = 0, CountFD, StrideFD };
7666 // We need two index variables here since the size of "Dims" matches the
7667 // size of Components; however, the sizes of offset, count, and stride are
7668 // equal to the number of non-contiguous base declarations.
7669 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7670 // Skip emitting IR if the dimension size is 1, since it cannot be
7671 // non-contiguous.
7672 if (NonContigInfo.Dims[I] == 1)
7673 continue;
7674 Builder.restoreIP(AllocaIP);
7675 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7676 AllocaInst *DimsAddr =
7677 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7678 Builder.restoreIP(CodeGenIP);
7679 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7680 unsigned RevIdx = EE - II - 1;
7681 Value *DimsLVal = Builder.CreateInBoundsGEP(
7682 DimsAddr->getAllocatedType(), DimsAddr,
7683 {Builder.getInt64(0), Builder.getInt64(II)});
7684 // Offset
7685 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7686 Builder.CreateAlignedStore(
7687 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7688 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7689 // Count
7690 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7691 Builder.CreateAlignedStore(
7692 NonContigInfo.Counts[L][RevIdx], CountLVal,
7693 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7694 // Stride
7695 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7696 Builder.CreateAlignedStore(
7697 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7698 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7699 }
7700 // args[I] = &dims
7701 Builder.restoreIP(CodeGenIP);
7702 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7703 DimsAddr, Builder.getPtrTy());
7704 Value *P = Builder.CreateConstInBoundsGEP2_32(
7705 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7706 Info.RTArgs.PointersArray, 0, I);
7707 Builder.CreateAlignedStore(
7708 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7709 ++L;
7710 }
7711}
7712
7713void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7714 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7715 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7716 BasicBlock *ExitBB, bool IsInit) {
7717 StringRef Prefix = IsInit ? ".init" : ".del";
7718
7719 // Evaluate if this is an array section.
7720 BasicBlock *BodyBB = BasicBlock::Create(
7721 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7722 Value *IsArray =
7723 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7724 Value *DeleteBit = Builder.CreateAnd(
7725 MapType,
7726 Builder.getInt64(
7727 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7728 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7729 Value *DeleteCond;
7730 Value *Cond;
7731 if (IsInit) {
7732 // base != begin?
7733 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7734 // IsPtrAndObj?
7735 Value *PtrAndObjBit = Builder.CreateAnd(
7736 MapType,
7737 Builder.getInt64(
7738 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7739 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7740 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7741 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7742 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7743 DeleteCond = Builder.CreateIsNull(
7744 DeleteBit,
7745 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7746 } else {
7747 Cond = IsArray;
7748 DeleteCond = Builder.CreateIsNotNull(
7749 DeleteBit,
7750 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7751 }
7752 Cond = Builder.CreateAnd(Cond, DeleteCond);
7753 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7754
7755 emitBlock(BodyBB, MapperFn);
7756 // Get the array size by multiplying element size and element number (i.e., \p
7757 // Size).
7758 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7759 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7760 // memory allocation/deletion purpose only.
7761 Value *MapTypeArg = Builder.CreateAnd(
7762 MapType,
7763 Builder.getInt64(
7764 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7765 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7766 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7767 MapTypeArg = Builder.CreateOr(
7768 MapTypeArg,
7769 Builder.getInt64(
7770 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7771 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7772
7773 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7774 // data structure.
7775 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7776 ArraySize, MapTypeArg, MapName};
7777 Builder.CreateCall(
7778 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7779 OffloadingArgs);
7780}
7781
7782Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
7783 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7784 llvm::Value *BeginArg)>
7785 GenMapInfoCB,
7786 Type *ElemTy, StringRef FuncName,
7787 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
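 // For illustration, the generated mapper has roughly this shape, matching
 // the six parameters pushed into Params below (a sketch, not verbatim IR):
 //   void <FuncName>(ptr %handle, ptr %base, ptr %begin,
 //                   i64 %size, i64 %type, ptr %name)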
7788 SmallVector<Type *> Params;
7789 Params.emplace_back(Builder.getPtrTy());
7790 Params.emplace_back(Builder.getPtrTy());
7791 Params.emplace_back(Builder.getPtrTy());
7792 Params.emplace_back(Builder.getInt64Ty());
7793 Params.emplace_back(Builder.getInt64Ty());
7794 Params.emplace_back(Builder.getPtrTy());
7795
7796 auto *FnTy =
7797 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7798
7799 SmallString<64> TyStr;
7800 raw_svector_ostream Out(TyStr);
7801 Function *MapperFn =
7802 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
7803 MapperFn->addFnAttr(Attribute::NoInline);
7804 MapperFn->addFnAttr(Attribute::NoUnwind);
7805 MapperFn->addParamAttr(0, Attribute::NoUndef);
7806 MapperFn->addParamAttr(1, Attribute::NoUndef);
7807 MapperFn->addParamAttr(2, Attribute::NoUndef);
7808 MapperFn->addParamAttr(3, Attribute::NoUndef);
7809 MapperFn->addParamAttr(4, Attribute::NoUndef);
7810 MapperFn->addParamAttr(5, Attribute::NoUndef);
7811
7812 // Start the mapper function code generation.
7813 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7814 auto SavedIP = Builder.saveIP();
7815 Builder.SetInsertPoint(EntryBB);
7816
7817 Value *MapperHandle = MapperFn->getArg(0);
7818 Value *BaseIn = MapperFn->getArg(1);
7819 Value *BeginIn = MapperFn->getArg(2);
7820 Value *Size = MapperFn->getArg(3);
7821 Value *MapType = MapperFn->getArg(4);
7822 Value *MapName = MapperFn->getArg(5);
7823
7824 // Compute the starting and end addresses of array elements.
7825 // Prepare common arguments for array initiation and deletion.
7826 // Convert the size in bytes into the number of array elements.
7827 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7828 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
7829 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7830 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7831
7832 // Emit array initiation if this is an array section and \p MapType indicates
7833 // that memory allocation is required.
7834 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7835 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7836 MapType, MapName, ElementSize, HeadBB,
7837 /*IsInit=*/true);
7838
7839 // Emit a for loop to iterate through \p Size elements and map all of them.
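 // The loop materializes as three blocks created below: omp.arraymap.head
 // performs the emptiness check, omp.arraymap.body maps one element and
 // advances the pointer PHI, and omp.arraymap.exit runs the deletion check.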
7840
7841 // Emit the loop header block.
7842 emitBlock(HeadBB, MapperFn);
7843 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7844 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
7845 // Evaluate whether the initial condition is satisfied.
7846 Value *IsEmpty =
7847 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
7848 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
7849
7850 // Emit the loop body block.
7851 emitBlock(BodyBB, MapperFn);
7852 BasicBlock *LastBB = BodyBB;
7853 PHINode *PtrPHI =
7854 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
7855 PtrPHI->addIncoming(PtrBegin, HeadBB);
7856
7857 // Get map clause information. Fill up the arrays with all mapped variables.
7858 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
7859
7860 // Call the runtime API __tgt_mapper_num_components to get the number of
7861 // pre-existing components.
7862 Value *OffloadingArgs[] = {MapperHandle};
7863 Value *PreviousSize = Builder.CreateCall(
7864 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
7865 OffloadingArgs);
7866 Value *ShiftedPreviousSize =
7867 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
7868
7869 // Fill up the runtime mapper handle for all components.
7870 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
7871 Value *CurBaseArg =
7872 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
7873 Value *CurBeginArg =
7874 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
7875 Value *CurSizeArg = Info.Sizes[I];
7876 Value *CurNameArg = Info.Names.size()
7877 ? Info.Names[I]
7878 : Constant::getNullValue(Builder.getPtrTy());
7879
7880 // Extract the MEMBER_OF field from the map type.
7881 Value *OriMapType = Builder.getInt64(
7882 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7883 Info.Types[I]));
7884 Value *MemberMapType =
7885 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
7886
7887 // Combine the map type inherited from user-defined mapper with that
7888 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
7889 // bits of the \a MapType, which is the input argument of the mapper
7890 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
7891 // bits of MemberMapType.
7892 // [OpenMP 5.0], 1.2.6. map-type decay.
7893 // | alloc | to | from | tofrom | release | delete
7894 // ----------------------------------------------------------
7895 // alloc | alloc | alloc | alloc | alloc | release | delete
7896 // to | alloc | to | alloc | to | release | delete
7897 // from | alloc | alloc | from | from | release | delete
7898 // tofrom | alloc | to | from | tofrom | release | delete
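 // Worked example (illustrative): if the mapper is invoked with a "to"
 // MapType and the member entry was declared "tofrom", the decayed result
 // is "to", i.e. the OMP_MAP_FROM bit of MemberMapType is cleared by the
 // branch structure emitted below.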
7899 Value *LeftToFrom = Builder.CreateAnd(
7900 MapType,
7901 Builder.getInt64(
7902 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7903 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7904 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7905 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
7906 BasicBlock *AllocElseBB =
7907 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
7908 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
7909 BasicBlock *ToElseBB =
7910 BasicBlock::Create(M.getContext(), "omp.type.to.else");
7911 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
7912 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
7913 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
7914 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
7915 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
7916 emitBlock(AllocBB, MapperFn);
7917 Value *AllocMapType = Builder.CreateAnd(
7918 MemberMapType,
7919 Builder.getInt64(
7920 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7921 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7922 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7923 Builder.CreateBr(EndBB);
7924 emitBlock(AllocElseBB, MapperFn);
7925 Value *IsTo = Builder.CreateICmpEQ(
7926 LeftToFrom,
7927 Builder.getInt64(
7928 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7929 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
7930 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
7931 // In case of to, clear OMP_MAP_FROM.
7932 emitBlock(ToBB, MapperFn);
7933 Value *ToMapType = Builder.CreateAnd(
7934 MemberMapType,
7935 Builder.getInt64(
7936 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7937 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7938 Builder.CreateBr(EndBB);
7939 emitBlock(ToElseBB, MapperFn);
7940 Value *IsFrom = Builder.CreateICmpEQ(
7941 LeftToFrom,
7942 Builder.getInt64(
7943 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7944 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7945 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
7946 // In case of from, clear OMP_MAP_TO.
7947 emitBlock(FromBB, MapperFn);
7948 Value *FromMapType = Builder.CreateAnd(
7949 MemberMapType,
7950 Builder.getInt64(
7951 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7952 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
7953 // In case of tofrom, do nothing.
7954 emitBlock(EndBB, MapperFn);
7955 LastBB = EndBB;
7956 PHINode *CurMapType =
7957 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
7958 CurMapType->addIncoming(AllocMapType, AllocBB);
7959 CurMapType->addIncoming(ToMapType, ToBB);
7960 CurMapType->addIncoming(FromMapType, FromBB);
7961 CurMapType->addIncoming(MemberMapType, ToElseBB);
7962
7963 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
7964 CurSizeArg, CurMapType, CurNameArg};
7965 Function *ChildMapperFn = nullptr;
7966 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
7967 // Call the corresponding mapper function.
7968 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
7969 } else {
7970 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7971 // data structure.
7972 Builder.CreateCall(
7973 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7974 OffloadingArgs);
7975 }
7976 }
7977
7978 // Update the pointer to point to the next element that needs to be mapped,
7979 // and check whether we have mapped all elements.
7980 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
7981 "omp.arraymap.next");
7982 PtrPHI->addIncoming(PtrNext, LastBB);
7983 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
7984 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
7985 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
7986
7987 emitBlock(ExitBB, MapperFn);
7988 // Emit array deletion if this is an array section and \p MapType indicates
7989 // that deletion is required.
7990 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7991 MapType, MapName, ElementSize, DoneBB,
7992 /*IsInit=*/false);
7993
7994 // Emit the function exit block.
7995 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
7996
7997 Builder.CreateRetVoid();
7998 Builder.restoreIP(SavedIP);
7999 return MapperFn;
8000}
8001
8002void OpenMPIRBuilder::emitOffloadingArrays(
8003 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8004 TargetDataInfo &Info, bool IsNonContiguous,
8005 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8006 function_ref<Value *(unsigned int)> CustomMapperCB) {
8007
8008 // Reset the array information.
8009 Info.clearArrayInfo();
8010 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8011
8012 if (Info.NumberOfPtrs == 0)
8013 return;
8014
8015 Builder.restoreIP(AllocaIP);
8016 // Detect if we have any capture size requiring runtime evaluation of the
8017 // size so that a constant array could be eventually used.
8018 ArrayType *PointerArrayType =
8019 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8020
8021 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8022 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8023
8024 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8025 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8026 AllocaInst *MappersArray = Builder.CreateAlloca(
8027 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8028 Info.RTArgs.MappersArray = MappersArray;
8029
8030 // If we don't have any VLA types or other types that require runtime
8031 // evaluation, we can use a constant array for the map sizes, otherwise we
8032 // need to fill up the arrays as we do for the pointers.
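 // For instance (illustrative): map(to: x) for a scalar has a compile-time
 // size and can live in the constant sizes array, whereas map(tofrom:
 // a[0:n]) has a runtime size and requires the per-entry stores emitted
 // further below.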
8033 Type *Int64Ty = Builder.getInt64Ty();
8034 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8035 ConstantInt::get(Int64Ty, 0));
8036 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8037 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8038 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8039 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8040 if (IsNonContiguous &&
8041 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8042 CombinedInfo.Types[I] &
8043 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8044 ConstSizes[I] =
8045 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8046 else
8047 ConstSizes[I] = CI;
8048 continue;
8049 }
8050 }
8051 RuntimeSizes.set(I);
8052 }
8053
8054 if (RuntimeSizes.all()) {
8055 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8056 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8057 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8058 Builder.restoreIP(CodeGenIP);
8059 } else {
8060 auto *SizesArrayInit = ConstantArray::get(
8061 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8062 std::string Name = createPlatformSpecificName({"offload_sizes"});
8063 auto *SizesArrayGbl =
8064 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8065 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8066 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8067
8068 if (!RuntimeSizes.any()) {
8069 Info.RTArgs.SizesArray = SizesArrayGbl;
8070 } else {
8071 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8072 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8073 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8074 AllocaInst *Buffer = Builder.CreateAlloca(
8075 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8076 Buffer->setAlignment(OffloadSizeAlign);
8077 Builder.restoreIP(CodeGenIP);
8078 Builder.CreateMemCpy(
8079 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8080 SizesArrayGbl, OffloadSizeAlign,
8081 Builder.getIntN(
8082 IndexSize,
8083 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8084
8085 Info.RTArgs.SizesArray = Buffer;
8086 }
8087 Builder.restoreIP(CodeGenIP);
8088 }
8089
8090 // The map types are always constant so we don't need to generate code to
8091 // fill arrays. Instead, we create an array constant.
8092 SmallVector<uint64_t, 4> Mapping;
8093 for (auto mapFlag : CombinedInfo.Types)
8094 Mapping.push_back(
8095 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8096 mapFlag));
8097 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8098 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8099 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8100
8101 // The information types are only built if provided.
8102 if (!CombinedInfo.Names.empty()) {
8103 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8104 auto *MapNamesArrayGbl =
8105 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8106 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8107 Info.EmitDebug = true;
8108 } else {
8109 Info.RTArgs.MapNamesArray =
8110 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
8111 Info.EmitDebug = false;
8112 }
8113
8114 // If there's a present map type modifier, it must not be applied to the end
8115 // of a region, so generate a separate map type array in that case.
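 // For example (illustrative): map(present, to: x) carries OMP_MAP_PRESENT
 // at region begin, but the matching end call must not require presence,
 // so a second map-type array with the bit cleared is emitted.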
8116 if (Info.separateBeginEndCalls()) {
8117 bool EndMapTypesDiffer = false;
8118 for (uint64_t &Type : Mapping) {
8119 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8120 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8121 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8122 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8123 EndMapTypesDiffer = true;
8124 }
8125 }
8126 if (EndMapTypesDiffer) {
8127 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8128 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8129 }
8130 }
8131
8132 PointerType *PtrTy = Builder.getPtrTy();
8133 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8134 Value *BPVal = CombinedInfo.BasePointers[I];
8135 Value *BP = Builder.CreateConstInBoundsGEP2_32(
8136 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8137 0, I);
8138 Builder.CreateAlignedStore(BPVal, BP,
8139 M.getDataLayout().getPrefTypeAlign(PtrTy));
8140
8141 if (Info.requiresDevicePointerInfo()) {
8142 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8143 CodeGenIP = Builder.saveIP();
8144 Builder.restoreIP(AllocaIP);
8145 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8146 Builder.restoreIP(CodeGenIP);
8147 if (DeviceAddrCB)
8148 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8149 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8150 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8151 if (DeviceAddrCB)
8152 DeviceAddrCB(I, BP);
8153 }
8154 }
8155
8156 Value *PVal = CombinedInfo.Pointers[I];
8157 Value *P = Builder.CreateConstInBoundsGEP2_32(
8158 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8159 I);
8160 // TODO: Check that the alignment is correct.
8161 Builder.CreateAlignedStore(PVal, P,
8162 M.getDataLayout().getPrefTypeAlign(PtrTy));
8163
8164 if (RuntimeSizes.test(I)) {
8165 Value *S = Builder.CreateConstInBoundsGEP2_32(
8166 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8167 /*Idx0=*/0,
8168 /*Idx1=*/I);
8169 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
8170 Int64Ty,
8171 /*isSigned=*/true),
8172 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8173 }
8174 // Fill up the mapper array.
8175 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8176 Value *MFunc = ConstantPointerNull::get(PtrTy);
8177 if (CustomMapperCB)
8178 if (Value *CustomMFunc = CustomMapperCB(I))
8179 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8180 Value *MAddr = Builder.CreateInBoundsGEP(
8181 MappersArray->getAllocatedType(), MappersArray,
8182 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8183 Builder.CreateAlignedStore(
8184 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8185 }
8186
8187 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8188 Info.NumberOfPtrs == 0)
8189 return;
8190 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8191}
8192
8193void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
8194 BasicBlock *CurBB = Builder.GetInsertBlock();
8195
8196 if (!CurBB || CurBB->getTerminator()) {
8197 // If there is no insert point or the previous block is already
8198 // terminated, don't touch it.
8199 } else {
8200 // Otherwise, create a fall-through branch.
8201 Builder.CreateBr(Target);
8202 }
8203
8204 Builder.ClearInsertionPoint();
8205}
8206
8207void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
8208 bool IsFinished) {
8209 BasicBlock *CurBB = Builder.GetInsertBlock();
8210
8211 // Fall out of the current block (if necessary).
8212 emitBranch(BB);
8213
8214 if (IsFinished && BB->use_empty()) {
8215 BB->eraseFromParent();
8216 return;
8217 }
8218
8219 // Place the block after the current block, if possible, or else at
8220 // the end of the function.
8221 if (CurBB && CurBB->getParent())
8222 CurFn->insert(std::next(CurBB->getIterator()), BB);
8223 else
8224 CurFn->insert(CurFn->end(), BB);
8225 Builder.SetInsertPoint(BB);
8226}
8227
8228Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
8229 BodyGenCallbackTy ElseGen,
8230 InsertPointTy AllocaIP) {
8231 // If the condition constant folds and can be elided, try to avoid emitting
8232 // the condition and the dead arm of the if/else.
8233 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8234 auto CondConstant = CI->getSExtValue();
8235 if (CondConstant)
8236 return ThenGen(AllocaIP, Builder.saveIP());
8237
8238 return ElseGen(AllocaIP, Builder.saveIP());
8239 }
8240
8241 Function *CurFn = Builder.GetInsertBlock()->getParent();
8242
8243 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8244 // emit the conditional branch.
8245 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8246 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8247 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8248 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8249 // Emit the 'then' code.
8250 emitBlock(ThenBlock, CurFn);
8251 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8252 return Err;
8253 emitBranch(ContBlock);
8254 // Emit the 'else' code if present.
8255 // There is no need to emit line number for unconditional branch.
8256 emitBlock(ElseBlock, CurFn);
8257 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8258 return Err;
8259 // There is no need to emit line number for unconditional branch.
8260 emitBranch(ContBlock);
8261 // Emit the continuation block for code after the if.
8262 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8263 return Error::success();
8264}
8265
8266bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8267 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8270 "Unexpected Atomic Ordering.");
8271
8272 bool Flush = false;
8273 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
8274
8275 switch (AK) {
8276 case Read:
8277 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
8278 AO == AtomicOrdering::SequentiallyConsistent) {
8279 FlushAO = AtomicOrdering::Acquire;
8280 Flush = true;
8281 }
8282 break;
8283 case Write:
8284 case Compare:
8285 case Update:
8286 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
8287 AO == AtomicOrdering::SequentiallyConsistent) {
8288 FlushAO = AtomicOrdering::Release;
8289 Flush = true;
8290 }
8291 break;
8292 case Capture:
8293 switch (AO) {
8294 case AtomicOrdering::Acquire:
8295 FlushAO = AtomicOrdering::Acquire;
8296 Flush = true;
8297 break;
8298 case AtomicOrdering::Release:
8299 FlushAO = AtomicOrdering::Release;
8300 Flush = true;
8301 break;
8302 case AtomicOrdering::AcquireRelease:
8303 case AtomicOrdering::SequentiallyConsistent:
8304 FlushAO = AtomicOrdering::AcquireRelease;
8305 Flush = true;
8306 break;
8307 default:
8308 // do nothing - leave silently.
8309 break;
8310 }
8311 }
8312
8313 if (Flush) {
8314 // The flush runtime call does not yet take a memory ordering argument.
8315 // Until it does, this resolves which ordering would apply and then
8316 // issues the ordering-agnostic flush call.
8317 // TODO: pass `FlushAO` once memory ordering support is added.
8318 (void)FlushAO;
8319 emitFlush(Loc);
8320 }
8321
8322 // For AO == AtomicOrdering::Monotonic and all other case combinations,
8323 // do nothing.
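 // For example, an atomic read with seq_cst ordering takes the Read path
 // above and emits a flush, while a monotonic read falls through without
 // one (illustrative).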
8324 return Flush;
8325}
8326
8327OpenMPIRBuilder::InsertPointTy
8328OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
8329 AtomicOpValue &X, AtomicOpValue &V,
8330 AtomicOrdering AO) {
8331 if (!updateToLocation(Loc))
8332 return Loc.IP;
8333
8334 assert(X.Var->getType()->isPointerTy() &&
8335 "OMP Atomic expects a pointer to target memory");
8336 Type *XElemTy = X.ElemTy;
8337 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8338 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8339 "OMP atomic read expected a scalar type");
8340
8341 Value *XRead = nullptr;
8342
8343 if (XElemTy->isIntegerTy()) {
8344 LoadInst *XLD =
8345 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8346 XLD->setAtomic(AO);
8347 XRead = cast<Value>(XLD);
8348 } else if (XElemTy->isStructTy()) {
8349 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8350 // target does not support `atomicrmw` of the size of the struct
8351 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8352 OldVal->setAtomic(AO);
8353 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8354 unsigned LoadSize =
8355 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8356 OpenMPIRBuilder::AtomicInfo atomicInfo(
8357 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8358 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8359 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8360 XRead = AtomicLoadRes.first;
8361 OldVal->eraseFromParent();
8362 } else {
8363 // We need to perform atomic op as integer
8364 IntegerType *IntCastTy =
8365 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8366 LoadInst *XLoad =
8367 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8368 XLoad->setAtomic(AO);
8369 if (XElemTy->isFloatingPointTy()) {
8370 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8371 } else {
8372 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8373 }
8374 }
8375 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8376 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8377 return Builder.saveIP();
8378}
8379
8380OpenMPIRBuilder::InsertPointTy
8381OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
8382 AtomicOpValue &X, Value *Expr,
8383 AtomicOrdering AO) {
8384 if (!updateToLocation(Loc))
8385 return Loc.IP;
8386
8387 assert(X.Var->getType()->isPointerTy() &&
8388 "OMP Atomic expects a pointer to target memory");
8389 Type *XElemTy = X.ElemTy;
8390 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8391 XElemTy->isPointerTy()) &&
8392 "OMP atomic write expected a scalar type");
8393
8394 if (XElemTy->isIntegerTy()) {
8395 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8396 XSt->setAtomic(AO);
8397 } else {
8398 // We need to bitcast and perform atomic op as integers
8399 IntegerType *IntCastTy =
8400 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8401 Value *ExprCast =
8402 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8403 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8404 XSt->setAtomic(AO);
8405 }
8406
8407 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8408 return Builder.saveIP();
8409}
8410
8411OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
8412 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8413 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8414 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8415 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8416 if (!updateToLocation(Loc))
8417 return Loc.IP;
8418
8419 LLVM_DEBUG({
8420 Type *XTy = X.Var->getType();
8421 assert(XTy->isPointerTy() &&
8422 "OMP Atomic expects a pointer to target memory");
8423 Type *XElemTy = X.ElemTy;
8424 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8425 XElemTy->isPointerTy()) &&
8426 "OMP atomic update expected a scalar type");
8427 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8428 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8429 "OpenMP atomic does not support LT or GT operations");
8430 });
8431
8432 Expected<std::pair<Value *, Value *>> AtomicResult =
8433 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8434 X.IsVolatile, IsXBinopExpr);
8435 if (!AtomicResult)
8436 return AtomicResult.takeError();
8437 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8438 return Builder.saveIP();
8439}
8440
8441// FIXME: Duplicating AtomicExpand
8442Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8443 AtomicRMWInst::BinOp RMWOp) {
8444 switch (RMWOp) {
8445 case AtomicRMWInst::Add:
8446 return Builder.CreateAdd(Src1, Src2);
8447 case AtomicRMWInst::Sub:
8448 return Builder.CreateSub(Src1, Src2);
8449 case AtomicRMWInst::And:
8450 return Builder.CreateAnd(Src1, Src2);
8451 case AtomicRMWInst::Nand:
8452 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
8453 case AtomicRMWInst::Or:
8454 return Builder.CreateOr(Src1, Src2);
8455 case AtomicRMWInst::Xor:
8456 return Builder.CreateXor(Src1, Src2);
8457 case AtomicRMWInst::Xchg:
8458 case AtomicRMWInst::FAdd:
8459 case AtomicRMWInst::FSub:
8460 case AtomicRMWInst::BAD_BINOP:
8461 case AtomicRMWInst::Max:
8462 case AtomicRMWInst::Min:
8463 case AtomicRMWInst::UMax:
8464 case AtomicRMWInst::UMin:
8465 case AtomicRMWInst::FMax:
8466 case AtomicRMWInst::FMin:
8467 case AtomicRMWInst::UIncWrap:
8468 case AtomicRMWInst::UDecWrap:
8469 case AtomicRMWInst::USubCond:
8470 case AtomicRMWInst::USubSat:
8471 llvm_unreachable("Unsupported atomic update operation");
8472 }
8473 llvm_unreachable("Unsupported atomic update operation");
8474}
8475
8476Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8477 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8478 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8479 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8480 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8481 // or a complex datatype.
8482 bool emitRMWOp = false;
8483 switch (RMWOp) {
8484 case AtomicRMWInst::Add:
8485 case AtomicRMWInst::And:
8486 case AtomicRMWInst::Nand:
8487 case AtomicRMWInst::Or:
8488 case AtomicRMWInst::Xor:
8489 case AtomicRMWInst::Xchg:
8490 emitRMWOp = XElemTy;
8491 break;
8492 case AtomicRMWInst::Sub:
8493 emitRMWOp = (IsXBinopExpr && XElemTy);
8494 break;
8495 default:
8496 emitRMWOp = false;
8497 }
8498 emitRMWOp &= XElemTy->isIntegerTy();
8499
8500 std::pair<Value *, Value *> Res;
8501 if (emitRMWOp) {
8502 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8503 // Not needed except in the case of postfix captures. Generate it anyway
8504 // for consistency with the else branch; any DCE pass will remove it.
8505 // AtomicRMWInst::Xchg does not have a corresponding instruction.
8506 if (RMWOp == AtomicRMWInst::Xchg)
8507 Res.second = Res.first;
8508 else
8509 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8510 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8511 XElemTy->isStructTy()) {
8512 LoadInst *OldVal =
8513 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8514 OldVal->setAtomic(AO);
8515 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8516 unsigned LoadSize =
8517 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8518
8519 OpenMPIRBuilder::AtomicInfo atomicInfo(
8520 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8521 OldVal->getAlign(), true /* UseLibcall */, X);
8522 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8523 BasicBlock *CurBB = Builder.GetInsertBlock();
8524 Instruction *CurBBTI = CurBB->getTerminator();
8525 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8526 BasicBlock *ExitBB =
8527 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8528 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8529 X->getName() + ".atomic.cont");
8530 ContBB->getTerminator()->eraseFromParent();
8531 Builder.restoreIP(AllocaIP);
8532 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8533 NewAtomicAddr->setName(X->getName() + "x.new.val");
8534 Builder.SetInsertPoint(ContBB);
8535 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8536 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8537 Value *OldExprVal = PHI;
8538 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8539 if (!CBResult)
8540 return CBResult.takeError();
8541 Value *Upd = *CBResult;
8542 Builder.CreateStore(Upd, NewAtomicAddr);
8543 AtomicOrdering Failure =
8544 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8545 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8546 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8547 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8548 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8549 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8550 OldVal->eraseFromParent();
8551 Res.first = OldExprVal;
8552 Res.second = Upd;
8553
8554 if (UnreachableInst *ExitTI =
8555 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8556 CurBBTI->eraseFromParent();
8557 Builder.SetInsertPoint(ExitBB);
8558 } else {
8559 Builder.SetInsertPoint(ExitTI);
8560 }
8561 } else {
8562 IntegerType *IntCastTy =
8563 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8564 LoadInst *OldVal =
8565 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8566 OldVal->setAtomic(AO);
8567 // CurBB
8568 // | /---\
8569 // ContBB |
8570 // | \---/
8571 // ExitBB
8572 BasicBlock *CurBB = Builder.GetInsertBlock();
8573 Instruction *CurBBTI = CurBB->getTerminator();
8574 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8575 BasicBlock *ExitBB =
8576 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8577 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8578 X->getName() + ".atomic.cont");
8579 ContBB->getTerminator()->eraseFromParent();
8580 Builder.restoreIP(AllocaIP);
8581 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8582 NewAtomicAddr->setName(X->getName() + "x.new.val");
8583 Builder.SetInsertPoint(ContBB);
8584 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8585 PHI->addIncoming(OldVal, CurBB);
8586 bool IsIntTy = XElemTy->isIntegerTy();
8587 Value *OldExprVal = PHI;
8588 if (!IsIntTy) {
8589 if (XElemTy->isFloatingPointTy()) {
8590 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8591 X->getName() + ".atomic.fltCast");
8592 } else {
8593 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8594 X->getName() + ".atomic.ptrCast");
8595 }
8596 }
8597
8598 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8599 if (!CBResult)
8600 return CBResult.takeError();
8601 Value *Upd = *CBResult;
8602 Builder.CreateStore(Upd, NewAtomicAddr);
8603 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8604 AtomicOrdering Failure =
8605 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8606 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
8607 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8608 Result->setVolatile(VolatileX);
8609 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8610 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8611 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8612 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8613
8614 Res.first = OldExprVal;
8615 Res.second = Upd;
8616
8617 // Set the insertion point in the exit block.
8618 if (UnreachableInst *ExitTI =
8619 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8620 CurBBTI->eraseFromParent();
8621 Builder.SetInsertPoint(ExitBB);
8622 } else {
8623 Builder.SetInsertPoint(ExitTI);
8624 }
8625 }
8626
8627 return Res;
8628}
8629
8630OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
8631 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8632 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8633 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8634 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8635 if (!updateToLocation(Loc))
8636 return Loc.IP;
8637
8638 LLVM_DEBUG({
8639 Type *XTy = X.Var->getType();
8640 assert(XTy->isPointerTy() &&
8641 "OMP Atomic expects a pointer to target memory");
8642 Type *XElemTy = X.ElemTy;
8643 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8644 XElemTy->isPointerTy()) &&
8645 "OMP atomic capture expected a scalar type");
8646 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8647 "OpenMP atomic does not support LT or GT operations");
8648 });
8649
8650 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8651 // 'x' is simply atomically rewritten with 'expr'.
8652 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8653 Expected<std::pair<Value *, Value *>> AtomicResult =
8654 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8655 X.IsVolatile, IsXBinopExpr);
8656 if (!AtomicResult)
8657 return AtomicResult.takeError();
8658 Value *CapturedVal =
8659 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8660 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8661
8662 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8663 return Builder.saveIP();
8664}
8665
8666OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8667 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8668 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8669 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8670 bool IsFailOnly) {
8671
8672 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8673 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8674 IsPostfixUpdate, IsFailOnly, Failure);
8675}
8676
8677OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8678 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8679 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8680 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8681 bool IsFailOnly, AtomicOrdering Failure) {
8682
8683 if (!updateToLocation(Loc))
8684 return Loc.IP;
8685
8686 assert(X.Var->getType()->isPointerTy() &&
8687 "OMP atomic expects a pointer to target memory");
8688 // compare capture
8689 if (V.Var) {
8690 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8691 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8692 }
8693
8694 bool IsInteger = E->getType()->isIntegerTy();
8695
8696 if (Op == OMPAtomicCompareOp::EQ) {
8697 AtomicCmpXchgInst *Result = nullptr;
8698 if (!IsInteger) {
8699 IntegerType *IntCastTy =
8700 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8701 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8702 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8703 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8704 AO, Failure);
8705 } else {
8706 Result =
8707 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8708 }
8709
8710 if (V.Var) {
8711 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8712 if (!IsInteger)
8713 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8714 assert(OldValue->getType() == V.ElemTy &&
8715 "OldValue and V must be of same type");
8716 if (IsPostfixUpdate) {
8717 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8718 } else {
8719 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8720 if (IsFailOnly) {
8721 // CurBB----
8722 // | |
8723 // v |
8724 // ContBB |
8725 // | |
8726 // v |
8727 // ExitBB <-
8728 //
8729 // where ContBB only contains the store of old value to 'v'.
8730 BasicBlock *CurBB = Builder.GetInsertBlock();
8731 Instruction *CurBBTI = CurBB->getTerminator();
8732 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8733 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8734 CurBBTI, X.Var->getName() + ".atomic.exit");
8735 BasicBlock *ContBB = CurBB->splitBasicBlock(
8736 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8737 ContBB->getTerminator()->eraseFromParent();
8738 CurBB->getTerminator()->eraseFromParent();
8739
8740 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8741
8742 Builder.SetInsertPoint(ContBB);
8743 Builder.CreateStore(OldValue, V.Var);
8744 Builder.CreateBr(ExitBB);
8745
8746 if (UnreachableInst *ExitTI =
8747 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8748 CurBBTI->eraseFromParent();
8749 Builder.SetInsertPoint(ExitBB);
8750 } else {
8751 Builder.SetInsertPoint(ExitTI);
8752 }
8753 } else {
8754 Value *CapturedValue =
8755 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8756 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8757 }
8758 }
8759 }
8760 // The comparison result has to be stored.
8761 if (R.Var) {
8762 assert(R.Var->getType()->isPointerTy() &&
8763 "r.var must be of pointer type");
8764 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8765
8766 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8767 Value *ResultCast = R.IsSigned
8768 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8769 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8770 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8771 }
8772 } else {
8773 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8774 "Op should be either max or min at this point");
8775 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8776
8777 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8778 // Let's take max as example.
8779 // OpenMP form:
8780 // x = x > expr ? expr : x;
8781 // LLVM form:
8782 // *ptr = *ptr > val ? *ptr : val;
8783 // We need to transform to LLVM form.
8784 // x = x <= expr ? x : expr;
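 // Concretely (illustrative): for a signed integer x with IsXBinopExpr set,
 // the OpenMP "max" form x = x > e ? e : x keeps the smaller value, so it
 // lowers to atomicrmw min below.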
8785 AtomicRMWInst::BinOp NewOp;
8786 if (IsXBinopExpr) {
8787 if (IsInteger) {
8788 if (X.IsSigned)
8789 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8790 : AtomicRMWInst::Max;
8791 else
8792 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8793 : AtomicRMWInst::UMax;
8794 } else {
8795 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8796 : AtomicRMWInst::FMax;
8797 }
8798 } else {
8799 if (IsInteger) {
8800 if (X.IsSigned)
8801 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8802 : AtomicRMWInst::Min;
8803 else
8804 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8805 : AtomicRMWInst::UMin;
8806 } else {
8807 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8808 : AtomicRMWInst::FMin;
8809 }
8810 }
8811
8812 AtomicRMWInst *OldValue =
8813 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8814 if (V.Var) {
8815 Value *CapturedValue = nullptr;
8816 if (IsPostfixUpdate) {
8817 CapturedValue = OldValue;
8818 } else {
8819 CmpInst::Predicate Pred;
8820 switch (NewOp) {
8821 case AtomicRMWInst::Max:
8822 Pred = CmpInst::ICMP_SGT;
8823 break;
8824 case AtomicRMWInst::UMax:
8825 Pred = CmpInst::ICMP_UGT;
8826 break;
8827 case AtomicRMWInst::FMax:
8828 Pred = CmpInst::FCMP_OGT;
8829 break;
8830 case AtomicRMWInst::Min:
8831 Pred = CmpInst::ICMP_SLT;
8832 break;
8833 case AtomicRMWInst::UMin:
8834 Pred = CmpInst::ICMP_ULT;
8835 break;
8836 case AtomicRMWInst::FMin:
8837 Pred = CmpInst::FCMP_OLT;
8838 break;
8839 default:
8840 llvm_unreachable("unexpected comparison op");
8841 }
8842 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8843 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8844 }
8845 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8846 }
8847 }
8848
8849 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8850
8851 return Builder.saveIP();
8852}
8853
8854OpenMPIRBuilder::InsertPointOrErrorTy
8855OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8856 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8857 Value *NumTeamsUpper, Value *ThreadLimit,
8858 Value *IfExpr) {
8859 if (!updateToLocation(Loc))
8860 return InsertPointTy();
8861
8862 uint32_t SrcLocStrSize;
8863 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8864 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8865 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8866
8867 // The outer allocation basic block is the entry block of the function.
8868 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8869 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8870 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8871 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8872 }
8873
8874 // The current basic block is split into four basic blocks. After outlining,
8875 // they will be mapped as follows:
8876 // ```
8877 // def current_fn() {
8878 // current_basic_block:
8879 // br label %teams.exit
8880 // teams.exit:
8881 // ; instructions after teams
8882 // }
8883 //
8884 // def outlined_fn() {
8885 // teams.alloca:
8886 // br label %teams.body
8887 // teams.body:
8888 // ; instructions within teams body
8889 // }
8890 // ```
8891 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8892 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8893 BasicBlock *AllocaBB =
8894 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8895
8896 bool SubClausesPresent =
8897 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8898 // Push num_teams
8899 if (!Config.isTargetDevice() && SubClausesPresent) {
8900 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8901 "if lowerbound is non-null, then upperbound must also be non-null "
8902 "for bounds on num_teams");
8903
8904 if (NumTeamsUpper == nullptr)
8905 NumTeamsUpper = Builder.getInt32(0);
8906
8907 if (NumTeamsLower == nullptr)
8908 NumTeamsLower = NumTeamsUpper;
8909
8910 if (IfExpr) {
8911 assert(IfExpr->getType()->isIntegerTy() &&
8912 "argument to if clause must be an integer value");
8913
8914 // upper = ifexpr ? upper : 1
8915 if (IfExpr->getType() != Int1)
8916 IfExpr = Builder.CreateICmpNE(IfExpr,
8917 ConstantInt::get(IfExpr->getType(), 0));
8918 NumTeamsUpper = Builder.CreateSelect(
8919 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8920
8921 // lower = ifexpr ? lower : 1
8922 NumTeamsLower = Builder.CreateSelect(
8923 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8924 }
8925
8926 if (ThreadLimit == nullptr)
8927 ThreadLimit = Builder.getInt32(0);
8928
8929 Value *ThreadNum = getOrCreateThreadID(Ident);
8930 Builder.CreateCall(
8931 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8932 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8933 }
8934 // Generate the body of teams.
8935 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8936 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8937 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
8938 return Err;
8939
8940 OutlineInfo OI;
8941 OI.EntryBB = AllocaBB;
8942 OI.ExitBB = ExitBB;
8943 OI.OuterAllocaBB = &OuterAllocaBB;
8944
8945 // Insert fake values for global tid and bound tid.
8946 SmallVector<Instruction *, 2> ToBeDeleted;
8947 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8948 ToBeDeleted.push_back(createFakeIntVal(
8949 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8950 ToBeDeleted.push_back(createFakeIntVal(
8951 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8952
8953 auto HostPostOutlineCB = [this, Ident,
8954 ToBeDeleted](Function &OutlinedFn) mutable {
8955 // The stale call instruction will be replaced with a new call instruction
8956 // for runtime call with the outlined function.
8957
8958 assert(OutlinedFn.getNumUses() == 1 &&
8959 "there must be a single user for the outlined function");
8960 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8961 ToBeDeleted.push_back(StaleCI);
8962
8963 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8964 "Outlined function must have two or three arguments only");
8965
8966 bool HasShared = OutlinedFn.arg_size() == 3;
8967
8968 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8969 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8970 if (HasShared)
8971 OutlinedFn.getArg(2)->setName("data");
8972
8973 // Call to the runtime function for teams in the current function.
8974 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8975 "outlined function.");
8976 Builder.SetInsertPoint(StaleCI);
8977 SmallVector<Value *> Args = {
8978 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8979 if (HasShared)
8980 Args.push_back(StaleCI->getArgOperand(2));
8981 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
8982 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8983 Args);
8984
8985 for (Instruction *I : llvm::reverse(ToBeDeleted))
8986 I->eraseFromParent();
8987 };
8988
8989 if (!Config.isTargetDevice())
8990 OI.PostOutlineCB = HostPostOutlineCB;
8991
8992 addOutlineInfo(std::move(OI));
8993
8994 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
8995
8996 return Builder.saveIP();
8997}
8998
8999GlobalVariable *
9000OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
9001 std::string VarName) {
9002 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9003 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
9004 Names.size()),
9005 Names);
9006 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9007 M, MapNamesArrayInit->getType(),
9008 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9009 VarName);
9010 return MapNamesArrayGlobal;
9011}
9012
9013// Create all simple and struct types exposed by the runtime and remember
9014// the llvm::PointerTypes of them for easy access later.
9015void OpenMPIRBuilder::initializeTypes(Module &M) {
9016 LLVMContext &Ctx = M.getContext();
9017 StructType *T;
9018#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9019#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9020 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9021 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
9022#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9023 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9024 VarName##Ptr = PointerType::getUnqual(VarName);
9025#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9026 T = StructType::getTypeByName(Ctx, StructName); \
9027 if (!T) \
9028 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9029 VarName = T; \
9030 VarName##Ptr = PointerType::getUnqual(T);
9031#include "llvm/Frontend/OpenMP/OMPKinds.def"
9032}
9033
9034void OpenMPIRBuilder::OutlineInfo::collectBlocks(
9035 SmallPtrSetImpl<BasicBlock *> &BlockSet,
9036 SmallVectorImpl<BasicBlock *> &BlockVector) {
9037 SmallVector<BasicBlock *, 32> Worklist;
9038 BlockSet.insert(EntryBB);
9039 BlockSet.insert(ExitBB);
9040
9041 Worklist.push_back(EntryBB);
9042 while (!Worklist.empty()) {
9043 BasicBlock *BB = Worklist.pop_back_val();
9044 BlockVector.push_back(BB);
9045 for (BasicBlock *SuccBB : successors(BB))
9046 if (BlockSet.insert(SuccBB).second)
9047 Worklist.push_back(SuccBB);
9048 }
9049}
9050
9051void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
9052 uint64_t Size, int32_t Flags,
9053 GlobalValue::LinkageTypes,
9054 StringRef Name) {
9055 if (!Config.isGPU()) {
9056 llvm::offloading::emitOffloadingEntry(
9057 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9058 "omp_offloading_entries");
9059 return;
9060 }
9061 // TODO: Add support for global variables on the device after declare target
9062 // support.
9063 Function *Fn = dyn_cast<Function>(Addr);
9064 if (!Fn)
9065 return;
9066
9067 Module &M = *(Fn->getParent());
9068 LLVMContext &Ctx = M.getContext();
9069
9070 // Get "nvvm.annotations" metadata node.
9071 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9072
9073 Metadata *MDVals[] = {
9074 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9075 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9076 // Append metadata to nvvm.annotations.
9077 MD->addOperand(MDNode::get(Ctx, MDVals));
9078
9079 // Add a function attribute for the kernel.
9080 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9081 if (T.isAMDGCN())
9082 Fn->addFnAttr("uniform-work-group-size", "true");
9083 Fn->addFnAttr(Attribute::MustProgress);
9084}
9085
9086// We only generate metadata for functions that contain target regions.
9087void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
9088 EmitMetadataErrorReportFunctionTy &ErrorFn) {
9089
9090 // If there are no entries, we don't need to do anything.
9091 if (OffloadInfoManager.empty())
9092 return;
9093 LLVMContext &C = M.getContext();
9094
9095 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
9096 TargetRegionEntryInfo>,
9097 16>
9098 OrderedEntries(OffloadInfoManager.size());
9099
9100 // Auxiliary methods to create metadata values and strings.
9101 auto &&GetMDInt = [this](unsigned V) {
9102 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9103 };
9104
9105 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9106
9107 // Create the offloading info metadata node.
9108 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9109 auto &&TargetRegionMetadataEmitter =
9110 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9111 const TargetRegionEntryInfo &EntryInfo,
9112 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
9113 // Generate metadata for target regions. Each entry of this metadata
9114 // contains:
9115 // - Entry 0 -> Kind of this type of metadata (0).
9116 // - Entry 1 -> Device ID of the file where the entry was identified.
9117 // - Entry 2 -> File ID of the file where the entry was identified.
9118 // - Entry 3 -> Mangled name of the function where the entry was
9119 // identified.
9120 // - Entry 4 -> Line in the file where the entry was identified.
9121 // - Entry 5 -> Count of regions at this DeviceID/FileID/Line.
9122 // - Entry 6 -> Order the entry was created.
9123 // The first element of the metadata node is the kind.
9124 Metadata *Ops[] = {
9125 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9126 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9127 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9128 GetMDInt(E.getOrder())};
9129
9130 // Save this entry in the right position of the ordered entries array.
9131 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9132
9133 // Add metadata to the named metadata node.
9134 MD->addOperand(MDNode::get(C, Ops));
9135 };
9136
9137 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9138
9139 // Create function that emits metadata for each device global variable entry;
9140 auto &&DeviceGlobalVarMetadataEmitter =
9141 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9142 StringRef MangledName,
9143 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
9144 // Generate metadata for global variables. Each entry of this metadata
9145 // contains:
9146 // - Entry 0 -> Kind of this type of metadata (1).
9147 // - Entry 1 -> Mangled name of the variable.
9148 // - Entry 2 -> Declare target kind.
9149 // - Entry 3 -> Order the entry was created.
9150 // The first element of the metadata node is the kind.
9151 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9152 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9153
9154 // Save this entry in the right position of the ordered entries array.
9155 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9156 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9157
9158 // Add metadata to the named metadata node.
9159 MD->addOperand(MDNode::get(C, Ops));
9160 };
9161
9162 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
9163 DeviceGlobalVarMetadataEmitter);
9164
9165 for (const auto &E : OrderedEntries) {
9166 assert(E.first && "All ordered entries must exist!");
9167 if (const auto *CE =
9168 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9169 E.first)) {
9170 if (!CE->getID() || !CE->getAddress()) {
9171 // Do not blame the entry if the parent function is not emitted.
9172 TargetRegionEntryInfo EntryInfo = E.second;
9173 StringRef FnName = EntryInfo.ParentName;
9174 if (!M.getNamedValue(FnName))
9175 continue;
9176 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9177 continue;
9178 }
9179 createOffloadEntry(CE->getID(), CE->getAddress(),
9180 /*Size=*/0, CE->getFlags(),
9181 GlobalValue::WeakAnyLinkage);
9182 } else if (const auto *CE = dyn_cast<
9183 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
9184 E.first)) {
9185 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
9186 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9187 CE->getFlags());
9188 switch (Flags) {
9189 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
9190 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
9191 if (Config.hasRequiresUnifiedSharedMemory())
9192 continue;
9193 if (!CE->getAddress()) {
9194 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9195 continue;
9196 }
9197 // The variable has no definition - no need to add the entry.
9198 if (CE->getVarSize() == 0)
9199 continue;
9200 break;
9201 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
9202 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9203 (!Config.isTargetDevice() && CE->getAddress())) &&
9204 "Declaret target link address is set.");
9205 if (Config.isTargetDevice())
9206 continue;
9207 if (!CE->getAddress()) {
9208 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
9209 continue;
9210 }
9211 break;
9212 default:
9213 break;
9214 }
9215
9216 // Hidden or internal symbols on the device are not externally visible.
9217 // We should not attempt to register them by creating an offloading
9218 // entry. Indirect variables are handled separately on the device.
9219 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9220 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9221 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9222 continue;
9223
9224 // Indirect globals need to use a special name that doesn't match the name
9225 // of the associated host global.
9226 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9227 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9228 Flags, CE->getLinkage(), CE->getVarName());
9229 else
9230 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9231 Flags, CE->getLinkage());
9232
9233 } else {
9234 llvm_unreachable("Unsupported entry kind.");
9235 }
9236 }
9237
9238 // Emit requires directive globals to a special entry so the runtime can
9239 // register them when the device image is loaded.
9240 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9241 // entries should be redesigned to better suit this use-case.
9245 /*Name=*/"",
9247 Config.getRequiresFlags(), "omp_offloading_entries");
9248}
9249
9250void TargetRegionEntryInfo::getTargetRegionEntryFnName(
9251 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9252 unsigned FileID, unsigned Line, unsigned Count) {
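 // The resulting name has the form (illustrative, assuming KernelNamePrefix
 // is "__omp_offloading_"):
 //   __omp_offloading_<deviceid>_<fileid>_<parentname>_l<line>[_<count>]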
9253 raw_svector_ostream OS(Name);
9254 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9255 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9256 if (Count)
9257 OS << "_" << Count;
9258}
9259
9260void OpenMPIRBuilder::getTargetRegionEntryFnName(
9261 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
9262 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9263 TargetRegionEntryInfo::getTargetRegionEntryFnName(
9264 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9265 EntryInfo.Line, NewCount);
9266}
9267
9268TargetRegionEntryInfo
9269OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
9270 StringRef ParentName) {
9271 sys::fs::UniqueID ID;
9272 auto FileIDInfo = CallBack();
9273 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9274 report_fatal_error(("Unable to get unique ID for file, during "
9275 "getTargetEntryUniqueInfo, error message: " +
9276 EC.message())
9277 .c_str());
9278 }
9279
9280 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9281 std::get<1>(FileIDInfo));
9282}
9283
9284unsigned OpenMPIRBuilder::getFlagMemberOffset() {
9285 unsigned Offset = 0;
9286 for (uint64_t Remain =
9287 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9288 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
9289 !(Remain & 1); Remain = Remain >> 1)
9290 Offset++;
9291 return Offset;
9292}
9293
9294 omp::OpenMPOffloadMappingFlags
9295 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
9296 // Shift by getFlagMemberOffset() bits.
9297 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9298 << getFlagMemberOffset());
9299}
9300
9301 void OpenMPIRBuilder::setCorrectMemberOfFlag(
9302 omp::OpenMPOffloadMappingFlags &Flags,
9303 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9304 // If the entry is PTR_AND_OBJ but has not been marked with the special
9305 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9306 // marked as MEMBER_OF.
9307 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9308 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
9309 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9310 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
9311 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
9312 return;
9313
9314 // Reset the placeholder value to prepare the flag for the assignment of the
9315 // proper MEMBER_OF value.
9316 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9317 Flags |= MemberOfFlag;
9318}
9319
9320 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
9321 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9322 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9323 bool IsDeclaration, bool IsExternallyVisible,
9324 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9325 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9326 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9327 std::function<Constant *()> GlobalInitializer,
9328 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9329 // TODO: convert this to utilise the IRBuilder Config rather than
9330 // a passed down argument.
9331 if (OpenMPSIMD)
9332 return nullptr;
9333
9334 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
9335 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9336 CaptureClause ==
9337 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9338 Config.hasRequiresUnifiedSharedMemory())) {
9339 SmallString<64> PtrName;
9340 {
9341 raw_svector_ostream OS(PtrName);
9342 OS << MangledName;
9343 if (!IsExternallyVisible)
9344 OS << format("_%x", EntryInfo.FileID);
9345 OS << "_decl_tgt_ref_ptr";
9346 }
9347
9348 Value *Ptr = M.getNamedValue(PtrName);
9349
9350 if (!Ptr) {
9351 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9352 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9353
9354 auto *GV = cast<GlobalVariable>(Ptr);
9355 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9356
9357 if (!Config.isTargetDevice()) {
9358 if (GlobalInitializer)
9359 GV->setInitializer(GlobalInitializer());
9360 else
9361 GV->setInitializer(GlobalValue);
9362 }
9363
9364 registerTargetGlobalVariable(
9365 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9366 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9367 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9368 }
9369
9370 return cast<Constant>(Ptr);
9371 }
9372
9373 return nullptr;
9374}
9375
9376 void OpenMPIRBuilder::registerTargetGlobalVariable(
9377 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9378 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9379 bool IsDeclaration, bool IsExternallyVisible,
9380 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9381 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9382 std::vector<Triple> TargetTriple,
9383 std::function<Constant *()> GlobalInitializer,
9384 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9385 Constant *Addr) {
9386 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
9387 (TargetTriple.empty() && !Config.isTargetDevice()))
9388 return;
9389
9390 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
9391 StringRef VarName;
9392 int64_t VarSize;
9393 GlobalValue::LinkageTypes Linkage;
9394
9395 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9396 CaptureClause ==
9397 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9398 !Config.hasRequiresUnifiedSharedMemory()) {
9399 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9400 VarName = MangledName;
9401 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9402
9403 if (!IsDeclaration)
9404 VarSize = divideCeil(
9405 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
9406 else
9407 VarSize = 0;
9408 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9409
9410 // This is a workaround carried over from Clang which prevents undesired
9411 // optimisation of internal variables.
9412 if (Config.isTargetDevice() &&
9413 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9414 // Do not create a "ref-variable" if the original is not also available
9415 // on the host.
9416 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
9417 return;
9418
9419 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9420
9421 if (!M.getNamedValue(RefName)) {
9422 Constant *AddrRef =
9423 getOrCreateInternalVariable(Addr->getType(), RefName);
9424 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9425 GvAddrRef->setConstant(true);
9426 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9427 GvAddrRef->setInitializer(Addr);
9428 GeneratedRefs.push_back(GvAddrRef);
9429 }
9430 }
9431 } else {
9434 else
9436
9437 if (Config.isTargetDevice()) {
9438 VarName = (Addr) ? Addr->getName() : "";
9439 Addr = nullptr;
9440 } else {
9442 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9443 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9444 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9445 VarName = (Addr) ? Addr->getName() : "";
9446 }
9447 VarSize = M.getDataLayout().getPointerSize();
9448 Linkage = GlobalValue::WeakAnyLinkage;
9449 }
9450
9451 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
9452 Flags, Linkage);
9453}
9454
9455 /// Loads all the offload entry information from the host IR
9456 /// metadata.
9457 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
9458 // If we are in target mode, load the metadata from the host IR. This code has
9459 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9460
9461 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
9462 if (!MD)
9463 return;
9464
9465 for (MDNode *MN : MD->operands()) {
9466 auto &&GetMDInt = [MN](unsigned Idx) {
9467 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9468 return cast<ConstantInt>(V->getValue())->getZExtValue();
9469 };
9470
9471 auto &&GetMDString = [MN](unsigned Idx) {
9472 auto *V = cast<MDString>(MN->getOperand(Idx));
9473 return V->getString();
9474 };
9475
9476 switch (GetMDInt(0)) {
9477 default:
9478 llvm_unreachable("Unexpected metadata!");
9479 break;
9480 case OffloadEntriesInfoManager::OffloadEntryInfo::
9481 OffloadingEntryInfoTargetRegion: {
9482 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9483 /*DeviceID=*/GetMDInt(1),
9484 /*FileID=*/GetMDInt(2),
9485 /*Line=*/GetMDInt(4),
9486 /*Count=*/GetMDInt(5));
9487 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
9488 /*Order=*/GetMDInt(6));
9489 break;
9490 }
9491 case OffloadEntriesInfoManager::OffloadEntryInfo::
9492 OffloadingEntryInfoDeviceGlobalVar:
9493 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
9494 /*MangledName=*/GetMDString(1),
9495 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9496 /*Flags=*/GetMDInt(2)),
9497 /*Order=*/GetMDInt(3));
9498 break;
9499 }
9500 }
9501}
9502
9503 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
9504 if (HostFilePath.empty())
9505 return;
9506
9507 auto Buf = MemoryBuffer::getFile(HostFilePath);
9508 if (std::error_code Err = Buf.getError()) {
9509 report_fatal_error(("error opening host file from host file path inside of "
9510 "OpenMPIRBuilder: " +
9511 Err.message())
9512 .c_str());
9513 }
9514
9515 LLVMContext Ctx;
9516 auto M = expectedToErrorOrAndEmitErrors(
9517 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9518 if (std::error_code Err = M.getError()) {
9519 report_fatal_error(
9520 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9521 .c_str());
9522 }
9523
9524 loadOffloadInfoMetadata(*M.get());
9525}
9526
9527//===----------------------------------------------------------------------===//
9528// OffloadEntriesInfoManager
9529//===----------------------------------------------------------------------===//
9530
9531 bool OffloadEntriesInfoManager::empty() const {
9532 return OffloadEntriesTargetRegion.empty() &&
9533 OffloadEntriesDeviceGlobalVar.empty();
9534}
9535
9536unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9537 const TargetRegionEntryInfo &EntryInfo) const {
9538 auto It = OffloadEntriesTargetRegionCount.find(
9539 getTargetRegionEntryCountKey(EntryInfo));
9540 if (It == OffloadEntriesTargetRegionCount.end())
9541 return 0;
9542 return It->second;
9543}
9544
9545void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9546 const TargetRegionEntryInfo &EntryInfo) {
9547 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9548 EntryInfo.Count + 1;
9549}
9550
9551 /// Initialize target region entry.
9552 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
9553 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9554 OffloadEntriesTargetRegion[EntryInfo] =
9555 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9556 OMPTargetRegionEntryTargetRegion);
9557 ++OffloadingEntriesNum;
9558}
9559
9560 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
9561 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
9562 OMPTargetRegionEntryKind Flags) {
9563 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9564
9565 // Update the EntryInfo with the next available count for this location.
9566 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9567
9568 // If we are emitting code for a target, the entry is already initialized,
9569 // and only has to be registered.
9570 if (OMPBuilder->Config.isTargetDevice()) {
9571 // This could happen if the device compilation is invoked standalone.
9572 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9573 return;
9574 }
9575 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9576 Entry.setAddress(Addr);
9577 Entry.setID(ID);
9578 Entry.setFlags(Flags);
9579 } else {
9580 if (Flags == OMPTargetRegionEntryTargetRegion &&
9581 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9582 return;
9583 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9584 "Target region entry already registered!");
9585 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9586 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9587 ++OffloadingEntriesNum;
9588 }
9589 incrementTargetRegionEntryInfoCount(EntryInfo);
9590}
9591
9592 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
9593 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9594
9595 // Update the EntryInfo with the next available count for this location.
9596 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9597
9598 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9599 if (It == OffloadEntriesTargetRegion.end()) {
9600 return false;
9601 }
9602 // Fail if this entry is already registered.
9603 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9604 return false;
9605 return true;
9606}
9607
9608 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
9609 const OffloadTargetRegionEntryInfoActTy &Action) {
9610 // Scan all target region entries and perform the provided action.
9611 for (const auto &It : OffloadEntriesTargetRegion) {
9612 Action(It.first, It.second);
9613 }
9614}
9615
9616 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
9617 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9618 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9619 ++OffloadingEntriesNum;
9620}
9621
9622 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
9623 StringRef VarName, Constant *Addr, int64_t VarSize,
9624 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
9625 if (OMPBuilder->Config.isTargetDevice()) {
9626 // This could happen if the device compilation is invoked standalone.
9627 if (!hasDeviceGlobalVarEntryInfo(VarName))
9628 return;
9629 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9630 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9631 if (Entry.getVarSize() == 0) {
9632 Entry.setVarSize(VarSize);
9633 Entry.setLinkage(Linkage);
9634 }
9635 return;
9636 }
9637 Entry.setVarSize(VarSize);
9638 Entry.setLinkage(Linkage);
9639 Entry.setAddress(Addr);
9640 } else {
9641 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9642 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9643 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9644 "Entry not initialized!");
9645 if (Entry.getVarSize() == 0) {
9646 Entry.setVarSize(VarSize);
9647 Entry.setLinkage(Linkage);
9648 }
9649 return;
9650 }
9651 if (Flags == OMPTargetGlobalVarEntryIndirect)
9652 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9653 Addr, VarSize, Flags, Linkage,
9654 VarName.str());
9655 else
9656 OffloadEntriesDeviceGlobalVar.try_emplace(
9657 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9658 ++OffloadingEntriesNum;
9659 }
9660}
9661
9662 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9663 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9664 // Scan all device global variable entries and perform the provided action.
9665 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9666 Action(E.getKey(), E.getValue());
9667}
9668
9669//===----------------------------------------------------------------------===//
9670// CanonicalLoopInfo
9671//===----------------------------------------------------------------------===//
9672
9673 void CanonicalLoopInfo::collectControlBlocks(
9674 SmallVectorImpl<BasicBlock *> &BBs) {
9675 // We only count those BBs as control blocks for which we do not need to
9676 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9677 // flow. For consistency, this also means we do not add the Body block, which
9678 // is just the entry to the body code.
9679 BBs.reserve(BBs.size() + 6);
9680 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9681}
9682
9683 BasicBlock *CanonicalLoopInfo::getPreheader() const {
9684 assert(isValid() && "Requires a valid canonical loop");
9685 for (BasicBlock *Pred : predecessors(Header)) {
9686 if (Pred != Latch)
9687 return Pred;
9688 }
9689 llvm_unreachable("Missing preheader");
9690}
9691
9692void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9693 assert(isValid() && "Requires a valid canonical loop");
9694
9695 Instruction *CmpI = &getCond()->front();
9696 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9697 CmpI->setOperand(1, TripCount);
9698
9699#ifndef NDEBUG
9700 assertOK();
9701#endif
9702}
9703
9704void CanonicalLoopInfo::mapIndVar(
9705 llvm::function_ref<Value *(Instruction *)> Updater) {
9706 assert(isValid() && "Requires a valid canonical loop");
9707
9708 Instruction *OldIV = getIndVar();
9709
9710 // Record all uses excluding those introduced by the updater. Uses by the
9711 // CanonicalLoopInfo itself to keep track of the number of iterations are
9712 // excluded.
9713 SmallVector<Use *> ReplaceableUses;
9714 for (Use &U : OldIV->uses()) {
9715 auto *User = dyn_cast<Instruction>(U.getUser());
9716 if (!User)
9717 continue;
9718 if (User->getParent() == getCond())
9719 continue;
9720 if (User->getParent() == getLatch())
9721 continue;
9722 ReplaceableUses.push_back(&U);
9723 }
9724
9725 // Run the updater that may introduce new uses
9726 Value *NewIV = Updater(OldIV);
9727
9728 // Replace the old uses with the value returned by the updater.
9729 for (Use *U : ReplaceableUses)
9730 U->set(NewIV);
9731
9732#ifndef NDEBUG
9733 assertOK();
9734#endif
9735}
9736
9737 void CanonicalLoopInfo::assertOK() const {
9738 #ifndef NDEBUG
9739 // No constraints if this object currently does not describe a loop.
9740 if (!isValid())
9741 return;
9742
9743 BasicBlock *Preheader = getPreheader();
9744 BasicBlock *Body = getBody();
9745 BasicBlock *After = getAfter();
9746
9747 // Verify standard control-flow we use for OpenMP loops.
9748 assert(Preheader);
9749 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9750 "Preheader must terminate with unconditional branch");
9751 assert(Preheader->getSingleSuccessor() == Header &&
9752 "Preheader must jump to header");
9753
9754 assert(Header);
9755 assert(isa<BranchInst>(Header->getTerminator()) &&
9756 "Header must terminate with unconditional branch");
9757 assert(Header->getSingleSuccessor() == Cond &&
9758 "Header must jump to exiting block");
9759
9760 assert(Cond);
9761 assert(Cond->getSinglePredecessor() == Header &&
9762 "Exiting block only reachable from header");
9763
9764 assert(isa<BranchInst>(Cond->getTerminator()) &&
9765 "Exiting block must terminate with conditional branch");
9766 assert(size(successors(Cond)) == 2 &&
9767 "Exiting block must have two successors");
9768 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9769 "Exiting block's first successor jump to the body");
9770 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9771 "Exiting block's second successor must exit the loop");
9772
9773 assert(Body);
9774 assert(Body->getSinglePredecessor() == Cond &&
9775 "Body only reachable from exiting block");
9776 assert(!isa<PHINode>(Body->front()));
9777
9778 assert(Latch);
9779 assert(isa<BranchInst>(Latch->getTerminator()) &&
9780 "Latch must terminate with unconditional branch");
9781 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9782 // TODO: To support simple redirecting of the end of the body code when it
9783 // has multiple predecessors, introduce another auxiliary basic block like preheader and after.
9784 assert(Latch->getSinglePredecessor() != nullptr);
9785 assert(!isa<PHINode>(Latch->front()));
9786
9787 assert(Exit);
9788 assert(isa<BranchInst>(Exit->getTerminator()) &&
9789 "Exit block must terminate with unconditional branch");
9790 assert(Exit->getSingleSuccessor() == After &&
9791 "Exit block must jump to after block");
9792
9793 assert(After);
9794 assert(After->getSinglePredecessor() == Exit &&
9795 "After block only reachable from exit block");
9796 assert(After->empty() || !isa<PHINode>(After->front()));
9797
9798 Instruction *IndVar = getIndVar();
9799 assert(IndVar && "Canonical induction variable not found?");
9800 assert(isa<IntegerType>(IndVar->getType()) &&
9801 "Induction variable must be an integer");
9802 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9803 "Induction variable must be a PHI in the loop header");
9804 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9805 assert(
9806 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9807 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9808
9809 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9810 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9811 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9812 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9813 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9814 ->isOne());
9815
9816 Value *TripCount = getTripCount();
9817 assert(TripCount && "Loop trip count not found?");
9818 assert(IndVar->getType() == TripCount->getType() &&
9819 "Trip count and induction variable must have the same type");
9820
9821 auto *CmpI = cast<CmpInst>(&Cond->front());
9822 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9823 "Exit condition must be a signed less-than comparison");
9824 assert(CmpI->getOperand(0) == IndVar &&
9825 "Exit condition must compare the induction variable");
9826 assert(CmpI->getOperand(1) == TripCount &&
9827 "Exit condition must compare with the trip count");
9828#endif
9829}
9830
9831 void CanonicalLoopInfo::invalidate() {
9832 Header = nullptr;
9833 Cond = nullptr;
9834 Latch = nullptr;
9835 Exit = nullptr;
9836}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:599
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:933
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:918
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1921
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1261
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1267
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2990
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Debug location.
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:809
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:853
arg_iterator arg_begin()
Definition: Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:356
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:754
size_t arg_size() const
Definition: Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:276
BasicBlock * getBlock() const
Definition: IRBuilder.h:291
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:289
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:292
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1416
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1065
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1848
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1886
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1780
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2561
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2050
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1305
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2198
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2554
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1255
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1979
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:600
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2044
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2146
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1378
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2210
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1420
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1382
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:540
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1873
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1732
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:296
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2403
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1186
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2269
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1386
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1163
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1797
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1458
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2032
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1517
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1133
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1920
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1966
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1810
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1369
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2141
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1429
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2587
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2448
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1861
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2018
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1539
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1157
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2301
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2281
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2224
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2582
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1498
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1561
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2379
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1446
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2065
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2156
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2704
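A minimal usage sketch tying several of the IRBuilder entries above together. Assumptions for illustration only: F is an existing function of type void(i32*), and all three blocks are created fresh.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch only: F is assumed to have signature 'void (i32*)'.
static void emitCompareAndBranch(Function &F) {
  LLVMContext &Ctx = F.getContext();
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", &F);
  BasicBlock *Then = BasicBlock::Create(Ctx, "then", &F);
  BasicBlock *Else = BasicBlock::Create(Ctx, "else", &F);

  IRBuilder<> Builder(Entry); // appends to the end of 'entry'
  Value *Ptr = F.getArg(0);
  Value *V = Builder.CreateLoad(Builder.getInt32Ty(), Ptr, "v");
  Value *Inc = Builder.CreateAdd(V, Builder.getInt32(1), "inc");
  Value *IsZero = Builder.CreateICmpEQ(Inc, Builder.getInt32(0), "is.zero");
  Builder.CreateCondBr(IsZero, Then, Else);

  Builder.SetInsertPoint(Then); // move the builder, then terminate each arm
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(Else);
  Builder.CreateRetVoid();
}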
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ordering.
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1553
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1545
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
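A small sketch (the metadata names are placeholders) showing how the MDString/MDTuple/NamedMDNode entries above compose, similar in spirit to the named offload-info metadata this file reads and writes.

#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static void emitExampleMetadata(Module &M) {
  LLVMContext &Ctx = M.getContext();
  Metadata *Ops[] = {MDString::get(Ctx, "example.entry"),
                     MDString::get(Ctx, "file.c")};
  MDNode *Node = MDTuple::get(Ctx, Ops);     // uniqued tuple node
  NamedMDNode *NMD = M.getOrInsertNamedMetadata("example.info");
  NMD->addOperand(Node);                     // append to the module-level list
}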
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful, otherwise returning null.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
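A sketch of the Module symbol-table queries listed above; the looked-up names are illustrative, not names this file necessarily uses.

#include "llvm/IR/Module.h"

using namespace llvm;

static void inspectModule(Module &M) {
  if (Function *F = M.getFunction("helper"))          // nullptr if absent
    (void)F->getName();
  if (GlobalVariable *GV = M.getGlobalVariable("flag"))
    (void)GV;
  for (GlobalVariable &G : M.globals())               // all module globals
    (void)G.getName();
}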
A tuple of MDNodes.
Definition: Metadata.h:1733
iterator_range< op_iterator > operands()
Definition: Metadata.h:1829
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part of a variable entry.
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link'.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to'.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
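A hedged sketch of the OffloadEntriesInfoManager API above; the entry-info field values are placeholders, and real clients split initialization (device side) and registration (host side) across compilation modes.

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include <cassert>

using namespace llvm;

static void recordEntry(OffloadEntriesInfoManager &Mgr, Constant *Addr,
                        Constant *ID) {
  TargetRegionEntryInfo Info("parent_fn", /*DeviceID=*/0, /*FileID=*/1,
                             /*Line=*/42);
  Mgr.registerTargetRegionEntryInfo(
      Info, Addr, ID,
      OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
  assert(Mgr.hasTargetRegionEntryInfo(Info) && "entry should now be known");
}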
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (if set to true).
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from CallBack.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the underlying module. Must be called before any other method and only once.
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for an OpenMP 'if' clause using the specified BodyGenCallbackTy. The logic: if (Cond) { ThenGen() } else { ElseGen() }.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current target region.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (scalar data types only). cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic capture for constructs (scalar data types only): V = X; X = X BinOp Expr, ...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationInfo objects that reference them.
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers}).
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the specified name.
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr)
Generator for '#omp task'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Callback used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates an offloading entry for the provided entry ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers, pointers, sizes, map types, and mappers.
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = UpdateOp(X).
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a call of finalize.
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction with registerTargetGlobalVariable.
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
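A hedged usage sketch for the OpenMPIRBuilder interface documented above, assuming OMPBuilder was constructed over the module and initialize() has already run; error handling is reduced to consumeError for brevity.

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/Support/Error.h"

using namespace llvm;

static void emitBarrierAt(OpenMPIRBuilder &OMPBuilder, IRBuilder<> &Builder) {
  OpenMPIRBuilder::LocationDescription Loc(
      Builder.saveIP(), Builder.getCurrentDebugLocation());
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP) {
    consumeError(AfterIP.takeError()); // sketch: real code would report this
    return;
  }
  Builder.restoreIP(*AfterIP); // continue emitting after the barrier
}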
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small size specializations.
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
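A tiny sketch of the set-insertion semantics listed above: insert() returns a pair whose second member reports whether the element was newly added.

#include "llvm/ADT/SmallPtrSet.h"

using namespace llvm;

static bool firstVisit(SmallPtrSet<const void *, 8> &Seen, const void *P) {
  return Seen.insert(P).second; // true only on the first insertion of P
}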
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
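A short sketch of the SmallVector operations listed above.

#include "llvm/ADT/SmallVector.h"
#include <iterator>

using namespace llvm;

static void vectorOps() {
  SmallVector<int, 4> V;
  V.reserve(8);                  // grow capacity up front
  V.push_back(1);
  V.emplace_back(2);             // construct in place
  int More[] = {3, 4};
  V.append(std::begin(More), std::end(More));
  V.resize(3);                   // drop the tail: {1, 2, 3}
}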
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
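A hedged sketch combining the LoadInst/StoreInst atomic setters listed above with IRBuilder, roughly how a scalar atomic read/write pair can be lowered; the i32 element type, 4-byte alignment, and monotonic ordering are assumptions for illustration.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static void emitAtomicCopy(IRBuilder<> &Builder, Value *Src, Value *Dst) {
  LoadInst *L = Builder.CreateLoad(Builder.getInt32Ty(), Src, "x.val");
  L->setAlignment(Align(4));               // atomics need explicit alignment
  L->setAtomic(AtomicOrdering::Monotonic); // relaxed atomic read
  StoreInst *S = Builder.CreateStore(L, Dst);
  S->setAlignment(Align(4));
  S->setAtomic(AtomicOrdering::Monotonic); // relaxed atomic write
}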
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: StringMap.h:253
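A sketch of StringMap::lookup from the entry above, in the spirit of the SrcLocStrMap cache documented earlier; the recompute step is a placeholder.

#include "llvm/ADT/StringMap.h"

using namespace llvm;

static int cachedValue(StringMap<int> &Cache, StringRef Key) {
  int V = Cache.lookup(Key); // default-constructed (0) if Key is absent
  if (!V)
    Cache[Key] = V = 1;      // illustrative recompute-and-cache step
  return V;
}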
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
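A sketch of the StringRef helpers above; the input literal is arbitrary.

#include "llvm/ADT/StringRef.h"

using namespace llvm;

static void stringOps() {
  StringRef S("file.c;42");
  auto [File, Line] = S.split(';');   // "file.c" and "42"
  bool IsC = File.ends_with(".c");    // true
  StringRef Stem = File.drop_back(2); // "file"
  size_t Dots = S.count('.');         // 1
  (void)IsC; (void)Stem; (void)Dots; (void)Line;
}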
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:978
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1036
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1046
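A sketch of the Triple predicates above; the returned alignments are illustrative placeholders, not the real per-target defaults computed by getOpenMPDefaultSimdAlign.

#include "llvm/TargetParser/Triple.h"

using namespace llvm;

static unsigned simdAlignFor(const Triple &T) {
  if (T.isPPC() || T.isWasm())
    return 128; // placeholder value
  if (T.isX86())
    return 256; // placeholder value
  return 64;    // placeholder default
}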
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to New if the callback ShouldReplace returns true for the given Use.
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of this value).
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:657
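A sketch of make_early_inc_range from the entry above: it advances the iterator before yielding each element, so the current element can be erased mid-loop, for example while rewriting a value's users.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static void eraseInstructionUsers(Value &V) {
  for (User *U : make_early_inc_range(V.users()))
    if (auto *I = dyn_cast<Instruction>(U))
      I->eraseFromParent(); // safe: the range iterator has already advanced
}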
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
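A sketch (assuming Builder is positioned inside a block) of the splitBB/splitBBWithSuffix helpers listed above, which the OpenMP lowering uses to carve region blocks out of the current one.

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

using namespace llvm;

static void carveRegion(IRBuilderBase &Builder) {
  // Instructions after the insert point move into the new block; a branch
  // to it is created because CreateBranch is true.
  BasicBlock *Cont =
      splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".region.cont");
  (void)Cont;
}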
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
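A quick sketch of the MathExtras helpers listed above.

#include "llvm/Support/MathExtras.h"

using namespace llvm;

static void mathHelpers() {
  unsigned FloorLog = Log2_32(64);       // 6 (floor of log base 2)
  unsigned Chunks = divideCeil(10u, 4u); // 3, i.e. ceil(10 / 4)
  (void)FloorLog; (void)Chunks;
}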
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular function.
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user-specified parameters.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified dead blocks; the set of deleted blocks must have no predecessors that are not themselves being deleted.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, and user-defined mappers.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61