//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e., whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice versa.
/// This is because an InsertPoint stores the instruction before which
/// something is inserted. For instance, if both point to the same
/// instruction, two IRBuilders alternately creating instructions will cause
/// the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use based on the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering or monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "The monotonic and nonmonotonic modifiers are mutually exclusive");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The OpenMP runtime library uses monotonic by default, so there is no
      // need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Invalid resulting schedule type");
  return Result;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
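
// A sketch of what the helper above emits, assuming AsPtr == false and
// Name == "gid" (the names are illustrative only); everything is recorded in
// ToBeDeleted and erased again once outlining has consumed the fake argument:
//
//   %gid.addr = alloca i32                   ; at OuterAllocaIP
//   %gid.val = load i32, ptr %gid.addr
//   %0 = add i32 %gid.val, 10                ; fake use at InnerAllocaIP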

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
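
// For reference (a sketch, not normative): the vector assembled above mirrors
// the field order of the kernel-arguments record that __tgt_target_kernel
// consumes, roughly
//
//   { i32 version, i32 num_args, ptr base_ptrs, ptr ptrs, ptr sizes,
//     ptr map_types, ptr map_names, ptr mappers, i64 trip_count, i64 flags,
//     [3 x i32] num_teams, [3 x i32] num_threads, i32 dyn_cgroup_mem }
//
// so any reordering here must be matched by the runtime-side definition.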

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target function, or we risk malformed optimisations by later passes. This
  // is only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it is up to the inserter to the list to do so). This notably has to
  // occur after the OutlinedInfo candidates have been extracted, so the end
  // product is not implicitly adversely affected by any raises unless
  // intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // by the time they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads; otherwise it would yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
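
// For illustration (the flag name is only an example): a call such as
// createGlobalFlag(1, "__omp_rtl_debug_kind") produces a global of the form
//
//   @__omp_rtl_debug_kind = weak_odr hidden constant i32 1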

void OpenMPIRBuilder::emitUsed(StringRef Name,
                               std::vector<WeakTrackingVH> &List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
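
// A sketch of the result: for a source-location string global @.str, the
// builder materializes a private global of the runtime's ident_t shape,
// roughly
//
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 <strlen>, ptr @.str }, align 8
//
// where the second field carries the IdentFlags (OMP_IDENT_FLAG_KMPC is
// always set) and the last two fields describe the location string.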

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
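
// For example, a location in function "foo" at line 42, column 7 of "bar.c"
// is encoded as the string ";bar.c;foo;42;7;;", which matches the default
// format used below for unknown locations.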

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
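
// A sketch of the IR this emits for an explicit barrier in a cancellable
// parallel region:
//
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   %ret = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
//   ; %ret != 0 means the region was cancelled; emitCancelationCheckImpl
//   ; branches to the finalization/cancellation block in that case.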

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
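
// The control flow emitted above looks roughly like this (a sketch; the
// fallback callee name is a placeholder supplied by the callback):
//
//   %ret = call i32 @__tgt_target_kernel(ptr @loc, i64 %dev, ...)
//   %failed = icmp ne i32 %ret, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
//
// omp_offload.failed:                        ; host fallback path
//   call void @<host_fallback_of_region>(...)
//   br label %omp_offload.cont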

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we have moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
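
// Sketch of the emitted shape: the cancel-flag check splits the current block
// into a ".cont" path (flag == 0, normal execution) and a ".cncl" path that
// runs the finalization callbacks before leaving the region:
//
//   %cmp = icmp eq i32 %cancel_flag, 0
//   br i1 %cmp, label %bb.cont, label %bb.cncl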

// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if the array for storing arguments is not
  // allocated in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
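
// The resulting device-side replacement has roughly this shape (a sketch;
// %a and %b stand in for whatever values the region captures):
//
//   %args = alloca [2 x ptr]
//   store ptr %a, ptr %args
//   ...
//   call void @__kmpc_parallel_51(ptr @ident, i32 %tid, i32 1, i32 -1,
//                                 i32 -1, ptr @outlined..omp_par, ptr null,
//                                 ptr %args, i64 2)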

// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
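
// For reference, the host-side replacement emitted above looks roughly like
// this (a sketch, with two captured pointers %a and %b):
//
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr @ident, i32 2, ptr @outlined..omp_par,
//                         ptr %a, ptr %b)
//
// The runtime then invokes @outlined..omp_par once per team thread, passing
// the thread ID and bound thread ID as the two leading arguments.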
1399
1401 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1402 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1403 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1404 omp::ProcBindKind ProcBind, bool IsCancellable) {
1405 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1406
1407 if (!updateToLocation(Loc))
1408 return Loc.IP;
1409
1410 uint32_t SrcLocStrSize;
1411 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1412 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1413 Value *ThreadID = getOrCreateThreadID(Ident);
1414 // If we generate code for the target device, we need to allocate
1415 // struct for aggregate params in the device default alloca address space.
1416 // OpenMP runtime requires that the params of the extracted functions are
1417 // passed as zero address space pointers. This flag ensures that extracted
1418 // function arguments are declared in zero address space
1419 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1420
1421 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1422 // only if we compile for host side.
1423 if (NumThreads && !Config.isTargetDevice()) {
1424 Value *Args[] = {
1425 Ident, ThreadID,
1426 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1428 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1429 }
1430
1431 if (ProcBind != OMP_PROC_BIND_default) {
1432 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1433 Value *Args[] = {
1434 Ident, ThreadID,
1435 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1437 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1438 }
1439
1440 BasicBlock *InsertBB = Builder.GetInsertBlock();
1441 Function *OuterFn = InsertBB->getParent();
1442
1443 // Save the outer alloca block because the insertion iterator may get
1444 // invalidated and we still need this later.
1445 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1446
1447 // Vector to remember instructions we used only during the modeling but which
1448 // we want to delete at the end.
1450
1451 // Change the location to the outer alloca insertion point to create and
1452 // initialize the allocas we pass into the parallel region.
1453 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1454 Builder.restoreIP(NewOuter);
1455 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1456 AllocaInst *ZeroAddrAlloca =
1457 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1458 Instruction *TIDAddr = TIDAddrAlloca;
1459 Instruction *ZeroAddr = ZeroAddrAlloca;
1460 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1461 // Add additional casts to enforce pointers in zero address space
1462 TIDAddr = new AddrSpaceCastInst(
1463 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1464 TIDAddr->insertAfter(TIDAddrAlloca);
1465 ToBeDeleted.push_back(TIDAddr);
1466 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1467 PointerType ::get(M.getContext(), 0),
1468 "zero.addr.ascast");
1469 ZeroAddr->insertAfter(ZeroAddrAlloca);
1470 ToBeDeleted.push_back(ZeroAddr);
1471 }
1472
1473 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1474 // associated arguments in the outlined function, so we delete them later.
1475 ToBeDeleted.push_back(TIDAddrAlloca);
1476 ToBeDeleted.push_back(ZeroAddrAlloca);
1477
1478 // Create an artificial insertion point that will also ensure the blocks we
1479 // are about to split are not degenerated.
1480 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1481
1482 BasicBlock *EntryBB = UI->getParent();
1483 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1484 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1485 BasicBlock *PRegPreFiniBB =
1486 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1487 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1488
1489 auto FiniCBWrapper = [&](InsertPointTy IP) {
1490 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1491 // target to the region exit block.
1492 if (IP.getBlock()->end() == IP.getPoint()) {
1494 Builder.restoreIP(IP);
1495 Instruction *I = Builder.CreateBr(PRegExitBB);
1496 IP = InsertPointTy(I->getParent(), I->getIterator());
1497 }
1498 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1499 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1500 "Unexpected insertion point for finalization call!");
1501 return FiniCB(IP);
1502 };
1503
1504 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1505
1506 // Generate the privatization allocas in the block that will become the entry
1507 // of the outlined function.
1508 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1509 InsertPointTy InnerAllocaIP = Builder.saveIP();
1510
1511 AllocaInst *PrivTIDAddr =
1512 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1513 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1514
1515 // Add some fake uses for OpenMP provided arguments.
1516 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1517 Instruction *ZeroAddrUse =
1518 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1519 ToBeDeleted.push_back(ZeroAddrUse);
1520
1521 // EntryBB
1522 // |
1523 // V
1524 // PRegionEntryBB <- Privatization allocas are placed here.
1525 // |
1526 // V
1527 // PRegionBodyBB <- BodeGen is invoked here.
1528 // |
1529 // V
1530 // PRegPreFiniBB <- The block we will start finalization from.
1531 // |
1532 // V
1533 // PRegionExitBB <- A common exit to simplify block collection.
1534 //
1535
1536 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1537
1538 // Let the caller create the body.
1539 assert(BodyGenCB && "Expected body generation callback!");
1540 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1541 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1542 return Err;
1543
1544 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1545
1546 OutlineInfo OI;
1547 if (Config.isTargetDevice()) {
1548 // Generate OpenMP target specific runtime call
1549 OI.PostOutlineCB = [=, ToBeDeletedVec =
1550 std::move(ToBeDeleted)](Function &OutlinedFn) {
1551 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1552 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1553 ThreadID, ToBeDeletedVec);
1554 };
1555 } else {
1556 // Generate OpenMP host runtime call
1557 OI.PostOutlineCB = [=, ToBeDeletedVec =
1558 std::move(ToBeDeleted)](Function &OutlinedFn) {
1559 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1560 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1561 };
1562 }
1563
1564 OI.OuterAllocaBB = OuterAllocaBlock;
1565 OI.EntryBB = PRegEntryBB;
1566 OI.ExitBB = PRegExitBB;
1567
1568 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1570 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1571
1572 // Ensure a single exit node for the outlined region by creating one.
1573 // We might have multiple incoming edges to the exit now due to finalizations,
1574 // e.g., cancel calls that cause the control flow to leave the region.
1575 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1576 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1577 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1578 Blocks.push_back(PRegOutlinedExitBB);
1579
1580 CodeExtractorAnalysisCache CEAC(*OuterFn);
1581 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1582 /* AggregateArgs */ false,
1583 /* BlockFrequencyInfo */ nullptr,
1584 /* BranchProbabilityInfo */ nullptr,
1585 /* AssumptionCache */ nullptr,
1586 /* AllowVarArgs */ true,
1587 /* AllowAlloca */ true,
1588 /* AllocationBlock */ OuterAllocaBlock,
1589 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1590
1591 // Find inputs to, outputs from the code region.
1592 BasicBlock *CommonExit = nullptr;
1593 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1594 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1595
1596 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1597 /*CollectGlobalInputs=*/true);
1598
1599 Inputs.remove_if([&](Value *I) {
1600 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1601 return GV->getValueType() == OpenMPIRBuilder::Ident;
1602
1603 return false;
1604 });
1605
1606 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1607
1608 FunctionCallee TIDRTLFn =
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1610
1611 auto PrivHelper = [&](Value &V) -> Error {
1612 if (&V == TIDAddr || &V == ZeroAddr) {
1613 OI.ExcludeArgsFromAggregate.push_back(&V);
1614 return Error::success();
1615 }
1616
1617 SetVector<Use *> Uses;
1618 for (Use &U : V.uses())
1619 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1620 if (ParallelRegionBlockSet.count(UserI->getParent()))
1621 Uses.insert(&U);
1622
1623 // __kmpc_fork_call expects extra arguments as pointers. If the input
1624 // already has a pointer type, everything is fine. Otherwise, store the
1625 // value onto the stack and load it back inside the to-be-outlined region.
1626 // This ensures that only the pointer will be passed to the function.
1627 // FIXME: if there are more than 15 trailing arguments, they must be
1628 // additionally packed in a struct.
1629 Value *Inner = &V;
1630 if (!V.getType()->isPointerTy()) {
1631 IRBuilder<>::InsertPointGuard Guard(Builder);
1632 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1633
1634 Builder.restoreIP(OuterAllocaIP);
1635 Value *Ptr =
1636 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1637
1638 // Store to stack at end of the block that currently branches to the entry
1639 // block of the to-be-outlined region.
1640 Builder.SetInsertPoint(InsertBB,
1641 InsertBB->getTerminator()->getIterator());
1642 Builder.CreateStore(&V, Ptr);
1643
1644 // Load back next to allocations in the to-be-outlined region.
1645 Builder.restoreIP(InnerAllocaIP);
1646 Inner = Builder.CreateLoad(V.getType(), Ptr);
1647 }
1648
1649 Value *ReplacementValue = nullptr;
1650 CallInst *CI = dyn_cast<CallInst>(&V);
1651 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1652 ReplacementValue = PrivTID;
1653 } else {
1654 InsertPointOrErrorTy AfterIP =
1655 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1656 if (!AfterIP)
1657 return AfterIP.takeError();
1658 Builder.restoreIP(*AfterIP);
1659 InnerAllocaIP = {
1660 InnerAllocaIP.getBlock(),
1661 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1662
1663 assert(ReplacementValue &&
1664 "Expected copy/create callback to set replacement value!");
1665 if (ReplacementValue == &V)
1666 return Error::success();
1667 }
1668
1669 for (Use *UPtr : Uses)
1670 UPtr->set(ReplacementValue);
1671
1672 return Error::success();
1673 };
1674
1675 // Reset the inner alloca insertion point as it will be used for loading the values
1676 // wrapped into pointers before passing them into the to-be-outlined region.
1677 // Configure it to insert immediately after the fake use of zero address so
1678 // that they are available in the generated body and so that the
1679 // OpenMP-related values (thread ID and zero address pointers) remain leading
1680 // in the argument list.
1681 InnerAllocaIP = IRBuilder<>::InsertPoint(
1682 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1683
1684 // Reset the outer alloca insertion point to the entry of the relevant block
1685 // in case it was invalidated.
1686 OuterAllocaIP = IRBuilder<>::InsertPoint(
1687 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1688
1689 for (Value *Input : Inputs) {
1690 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1691 if (Error Err = PrivHelper(*Input))
1692 return Err;
1693 }
1694 LLVM_DEBUG({
1695 for (Value *Output : Outputs)
1696 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1697 });
1698 assert(Outputs.empty() &&
1699 "OpenMP outlining should not produce live-out values!");
1700
1701 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1702 LLVM_DEBUG({
1703 for (auto *BB : Blocks)
1704 dbgs() << " PBR: " << BB->getName() << "\n";
1705 });
1706
1707 // Adjust the finalization stack, verify the adjustment, and call the
1708 // finalize function one last time to finalize values between the pre-fini
1709 // block and the exit block if we left the parallel region "the normal way".
1710 auto FiniInfo = FinalizationStack.pop_back_val();
1711 (void)FiniInfo;
1712 assert(FiniInfo.DK == OMPD_parallel &&
1713 "Unexpected finalization stack state!");
1714
1715 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1716
1717 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1718 if (Error Err = FiniCB(PreFiniIP))
1719 return Err;
1720
1721 // Register the outlined info.
1722 addOutlineInfo(std::move(OI));
1723
1724 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1725 UI->eraseFromParent();
1726
1727 return AfterIP;
1728}
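// A rough sketch of what the host path produces after outlining (names such
// as @outlined.omp_par and %captured.arg are invented for illustration; the
// actual call is emitted by hostParallelCallback):
//
// \code{c}
// ; void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro fn, ...)
// call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//     ptr @loc, i32 1, ptr @outlined.omp_par, ptr %captured.arg)
// \endcode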
1729
1730void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1731 // Build call void __kmpc_flush(ident_t *loc)
1732 uint32_t SrcLocStrSize;
1733 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1734 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1735
1736 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1737}
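// For reference, the emitted flush lowers to IR of this shape (sketch; @loc
// stands for the ident_t global created above):
//
// \code{c}
// call void @__kmpc_flush(ptr @loc)
// \endcode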
1738
1739void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1740 if (!updateToLocation(Loc))
1741 return;
1742 emitFlush(Loc);
1743}
1744
1745void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1746 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1747 // global_tid);
1748 uint32_t SrcLocStrSize;
1749 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1750 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1751 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1752
1753 // Ignore return result until untied tasks are supported.
1754 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1755 Args);
1756}
1757
1758void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1759 if (!updateToLocation(Loc))
1760 return;
1761 emitTaskwaitImpl(Loc);
1762}
1763
1764void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1765 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1766 uint32_t SrcLocStrSize;
1767 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1768 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1769 Constant *I32Null = ConstantInt::getNullValue(Int32);
1770 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1771
1772 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1773 Args);
1774}
1775
1776void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1777 if (!updateToLocation(Loc))
1778 return;
1779 emitTaskyieldImpl(Loc);
1780}
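// C-level sketch of the runtime entry points used by the two helpers above
// (see OMPKinds.def for the authoritative signatures; parameter names are
// approximate):
//
// \code{c}
// kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid);
// kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid,
//                                int end_part);
// \endcode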
1781
1782// Processes the dependencies in Dependencies and does the following
1783// - Allocates space on the stack for an array of DependInfo objects
1784// - Populates each DependInfo object with the relevant information of
1785// the corresponding dependence.
1786// - All code is inserted in the entry block of the current function.
1787static Value *emitTaskDependencies(
1788 OpenMPIRBuilder &OMPBuilder,
1789 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1790 // Early return if we have no dependencies to process
1791 if (Dependencies.empty())
1792 return nullptr;
1793
1794 // Given a vector of DependData objects, in this function we create an
1795 // array on the stack that holds kmp_dep_info objects corresponding
1796 // to each dependency. This is then passed to the OpenMP runtime.
1797 // For example, if there are 'n' dependencies then the following pseudo
1798 // code is generated. Assume the first dependence is on a variable 'a'.
1799 //
1800 // \code{c}
1801 // DepArray = alloc(n x sizeof(kmp_depend_info));
1802 // idx = 0;
1803 // DepArray[idx].base_addr = ptrtoint(&a);
1804 // DepArray[idx].len = 8;
1805 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1806 // ++idx;
1807 // DepArray[idx].base_addr = ...;
1808 // \endcode
1809
1810 IRBuilderBase &Builder = OMPBuilder.Builder;
1811 Type *DependInfo = OMPBuilder.DependInfo;
1812 Module &M = OMPBuilder.M;
1813
1814 Value *DepArray = nullptr;
1815 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1816 Builder.SetInsertPoint(
1817 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1818
1819 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1820 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1821
1822 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1823 Value *Base =
1824 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1825 // Store the pointer to the variable
1826 Value *Addr = Builder.CreateStructGEP(
1827 DependInfo, Base,
1828 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1829 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1830 Builder.CreateStore(DepValPtr, Addr);
1831 // Store the size of the variable
1832 Value *Size = Builder.CreateStructGEP(
1833 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1834 Builder.CreateStore(
1835 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1836 Size);
1837 // Store the dependency kind
1838 Value *Flags = Builder.CreateStructGEP(
1839 DependInfo, Base,
1840 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1841 Builder.CreateStore(
1842 ConstantInt::get(Builder.getInt8Ty(),
1843 static_cast<unsigned int>(Dep.DepKind)),
1844 Flags);
1845 }
1846 Builder.restoreIP(OldIP);
1847 return DepArray;
1848}
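// The DependInfo records populated above mirror the runtime's kmp_depend_info
// layout, roughly (C sketch; field and type names are approximate):
//
// \code{c}
// typedef struct kmp_depend_info {
//   intptr_t base_addr;  /* RTLDependInfoFields::BaseAddr */
//   size_t len;          /* RTLDependInfoFields::Len */
//   unsigned char flags; /* RTLDependInfoFields::Flags, i.e. the DepKind */
// } kmp_depend_info_t;
// \endcode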
1849
1850OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1851 const LocationDescription &Loc, InsertPointTy AllocaIP,
1852 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1853 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle) {
1854
1855 if (!updateToLocation(Loc))
1856 return InsertPointTy();
1857
1858 uint32_t SrcLocStrSize;
1859 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1860 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1861 // The current basic block is split into four basic blocks. After outlining,
1862 // they will be mapped as follows:
1863 // ```
1864 // def current_fn() {
1865 // current_basic_block:
1866 // br label %task.exit
1867 // task.exit:
1868 // ; instructions after task
1869 // }
1870 // def outlined_fn() {
1871 // task.alloca:
1872 // br label %task.body
1873 // task.body:
1874 // ret void
1875 // }
1876 // ```
1877 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1878 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1879 BasicBlock *TaskAllocaBB =
1880 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1881
1882 InsertPointTy TaskAllocaIP =
1883 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1884 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1885 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1886 return Err;
1887
1888 OutlineInfo OI;
1889 OI.EntryBB = TaskAllocaBB;
1890 OI.OuterAllocaBB = AllocaIP.getBlock();
1891 OI.ExitBB = TaskExitBB;
1892
1893 // Add the thread ID argument.
1894 SmallVector<Instruction *, 4> ToBeDeleted;
1895 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1896 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1897
1898 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1899 Mergeable, EventHandle, TaskAllocaBB,
1900 ToBeDeleted](Function &OutlinedFn) mutable {
1901 // Replace the stale call instruction with an appropriate RTL function call.
1902 assert(OutlinedFn.getNumUses() == 1 &&
1903 "there must be a single user for the outlined function");
1904 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1905
1906 // HasShareds is true if any variables are captured in the outlined region,
1907 // false otherwise.
1908 bool HasShareds = StaleCI->arg_size() > 1;
1909 Builder.SetInsertPoint(StaleCI);
1910
1911 // Gather the arguments for emitting the runtime call for
1912 // @__kmpc_omp_task_alloc
1913 Function *TaskAllocFn =
1914 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1915
1916 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1917 // call.
1918 Value *ThreadID = getOrCreateThreadID(Ident);
1919
1920 // Argument - `flags`
1921 // Task is tied iff (Flags & 1) == 1.
1922 // Task is untied iff (Flags & 1) == 0.
1923 // Task is final iff (Flags & 2) == 2.
1924 // Task is not final iff (Flags & 2) == 0.
1925 // Task is mergeable iff (Flags & 4) == 4.
1926 // Task is not mergeable iff (Flags & 4) == 0.
1927 // TODO: Handle the other flags.
1928 Value *Flags = Builder.getInt32(Tied);
1929 if (Final) {
1930 Value *FinalFlag =
1931 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1932 Flags = Builder.CreateOr(FinalFlag, Flags);
1933 }
1934
1935 if (Mergeable)
1936 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1937
1938 // Argument - `sizeof_kmp_task_t` (TaskSize)
1939 // Tasksize refers to the size in bytes of the kmp_task_t data structure
1940 // including private vars accessed in task.
1941 // TODO: add kmp_task_t_with_privates (privates)
1942 Value *TaskSize = Builder.getInt64(
1943 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1944
1945 // Argument - `sizeof_shareds` (SharedsSize)
1946 // SharedsSize refers to the shareds array size in the kmp_task_t data
1947 // structure.
1948 Value *SharedsSize = Builder.getInt64(0);
1949 if (HasShareds) {
1950 AllocaInst *ArgStructAlloca =
1951 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1952 assert(ArgStructAlloca &&
1953 "Unable to find the alloca instruction corresponding to arguments "
1954 "for extracted function");
1955 StructType *ArgStructType =
1956 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1957 assert(ArgStructType && "Unable to find struct type corresponding to "
1958 "arguments for extracted function");
1959 SharedsSize =
1960 M.getDataLayout().getTypeStoreSize(ArgStructType);
1961 }
1962 // Emit the @__kmpc_omp_task_alloc runtime call
1963 // The runtime call returns a pointer to an area where the task captured
1964 // variables must be copied before the task is run (TaskData)
1965 CallInst *TaskData = Builder.CreateCall(
1966 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1967 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1968 /*task_func=*/&OutlinedFn});
1969
1970 // Emit detach clause initialization.
1971 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1972 // task_descriptor);
1973 if (EventHandle) {
1974 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
1975 OMPRTL___kmpc_task_allow_completion_event);
1976 llvm::Value *EventVal =
1977 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1978 llvm::Value *EventHandleAddr =
1979 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
1980 Builder.getPtrTy(0));
1981 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1982 Builder.CreateStore(EventVal, EventHandleAddr);
1983 }
1984 // Copy the arguments for outlined function
1985 if (HasShareds) {
1986 Value *Shareds = StaleCI->getArgOperand(1);
1987 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1988 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1989 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1990 SharedsSize);
1991 }
1992
1993 Value *DepArray = nullptr;
1994 if (Dependencies.size()) {
1995 InsertPointTy OldIP = Builder.saveIP();
1996 Builder.SetInsertPoint(
1997 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1998
1999 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2000 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2001
2002 unsigned P = 0;
2003 for (const DependData &Dep : Dependencies) {
2004 Value *Base =
2005 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
2006 // Store the pointer to the variable
2007 Value *Addr = Builder.CreateStructGEP(
2008 DependInfo, Base,
2009 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2010 Value *DepValPtr =
2011 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2012 Builder.CreateStore(DepValPtr, Addr);
2013 // Store the size of the variable
2014 Value *Size = Builder.CreateStructGEP(
2015 DependInfo, Base,
2016 static_cast<unsigned int>(RTLDependInfoFields::Len));
2017 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
2018 Dep.DepValueType)),
2019 Size);
2020 // Store the dependency kind
2021 Value *Flags = Builder.CreateStructGEP(
2022 DependInfo, Base,
2023 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2024 Builder.CreateStore(
2025 ConstantInt::get(Builder.getInt8Ty(),
2026 static_cast<unsigned int>(Dep.DepKind)),
2027 Flags);
2028 ++P;
2029 }
2030
2031 Builder.restoreIP(OldIP);
2032 }
2033
2034 // In the presence of the `if` clause, the following IR is generated:
2035 // ...
2036 // %data = call @__kmpc_omp_task_alloc(...)
2037 // br i1 %if_condition, label %then, label %else
2038 // then:
2039 // call @__kmpc_omp_task(...)
2040 // br label %exit
2041 // else:
2042 // ;; Wait for resolution of dependencies, if any, before
2043 // ;; beginning the task
2044 // call @__kmpc_omp_wait_deps(...)
2045 // call @__kmpc_omp_task_begin_if0(...)
2046 // call @outlined_fn(...)
2047 // call @__kmpc_omp_task_complete_if0(...)
2048 // br label %exit
2049 // exit:
2050 // ...
2051 if (IfCondition) {
2052 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2053 // terminator.
2054 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2055 Instruction *IfTerminator =
2056 Builder.GetInsertPoint()->getParent()->getTerminator();
2057 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2058 Builder.SetInsertPoint(IfTerminator);
2059 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2060 &ElseTI);
2061 Builder.SetInsertPoint(ElseTI);
2062
2063 if (Dependencies.size()) {
2064 Function *TaskWaitFn =
2065 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2066 Builder.CreateCall(
2067 TaskWaitFn,
2068 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2069 ConstantInt::get(Builder.getInt32Ty(), 0),
2070 ConstantPointerNull::get(Builder.getPtrTy())});
2071 }
2072 Function *TaskBeginFn =
2073 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2074 Function *TaskCompleteFn =
2075 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2076 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2077 CallInst *CI = nullptr;
2078 if (HasShareds)
2079 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2080 else
2081 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2082 CI->setDebugLoc(StaleCI->getDebugLoc());
2083 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2084 Builder.SetInsertPoint(ThenTI);
2085 }
2086
2087 if (Dependencies.size()) {
2088 Function *TaskFn =
2089 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2090 Builder.CreateCall(
2091 TaskFn,
2092 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2093 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2094 ConstantPointerNull::get(Builder.getPtrTy())});
2095
2096 } else {
2097 // Emit the @__kmpc_omp_task runtime call to spawn the task
2098 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2099 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2100 }
2101
2102 StaleCI->eraseFromParent();
2103
2104 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2105 if (HasShareds) {
2106 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2107 OutlinedFn.getArg(1)->replaceUsesWithIf(
2108 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2109 }
2110
2111 for (Instruction *I : llvm::reverse(ToBeDeleted))
2112 I->eraseFromParent();
2113 };
2114
2115 addOutlineInfo(std::move(OI));
2116 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2117
2118 return Builder.saveIP();
2119}
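// Taken together, the deferred-task path above emits code of this shape
// (sketch; %task.data names the kmp_task_t buffer returned by the runtime):
//
// \code{c}
// %task.data = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid,
//                  i32 %flags, i64 %sizeof_task, i64 %sizeof_shareds,
//                  ptr @outlined_fn)
// ; memcpy captured variables into the shareds area, then:
// call i32 @__kmpc_omp_task(ptr @loc, i32 %gtid, ptr %task.data)
// ; with dependencies, __kmpc_omp_task_with_deps is called instead, and a
// ; false `if` clause uses the task_begin_if0/task_complete_if0 pair.
// \endcode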
2120
2121OpenMPIRBuilder::InsertPointOrErrorTy
2122OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2123 InsertPointTy AllocaIP,
2124 BodyGenCallbackTy BodyGenCB) {
2125 if (!updateToLocation(Loc))
2126 return InsertPointTy();
2127
2128 uint32_t SrcLocStrSize;
2129 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2130 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2131 Value *ThreadID = getOrCreateThreadID(Ident);
2132
2133 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2134 Function *TaskgroupFn =
2135 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2136 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2137
2138 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2139 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2140 return Err;
2141
2142 Builder.SetInsertPoint(TaskgroupExitBB);
2143 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2144 Function *EndTaskgroupFn =
2145 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2146 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2147
2148 return Builder.saveIP();
2149}
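// The taskgroup construct therefore simply brackets the user body (sketch):
//
// \code{c}
// call void @__kmpc_taskgroup(ptr @loc, i32 %gtid)
// ; ... body, typically spawning tasks ...
// call void @__kmpc_end_taskgroup(ptr @loc, i32 %gtid) ; waits on child tasks
// \endcode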
2150
2151OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2152 const LocationDescription &Loc, InsertPointTy AllocaIP,
2153 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2154 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2155 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2156
2157 if (!updateToLocation(Loc))
2158 return Loc.IP;
2159
2160 auto FiniCBWrapper = [&](InsertPointTy IP) {
2161 if (IP.getBlock()->end() != IP.getPoint())
2162 return FiniCB(IP);
2163 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2164 // will fail because that function requires the Finalization Basic Block to
2165 // have a terminator, which is already removed by EmitOMPRegionBody.
2166 // IP is currently at the cancellation block.
2167 // We need to backtrack to the condition block to fetch
2168 // the exit block and create a branch from the cancellation
2169 // block to the exit block.
2170 IRBuilder<>::InsertPointGuard IPG(Builder);
2171 Builder.restoreIP(IP);
2172 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2173 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2174 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2175 Instruction *I = Builder.CreateBr(ExitBB);
2176 IP = InsertPointTy(I->getParent(), I->getIterator());
2177 return FiniCB(IP);
2178 };
2179
2180 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2181
2182 // Each section is emitted as a switch case
2183 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2184 // -> OMP.createSection() which generates the IR for each section
2185 // Iterate through all sections and emit a switch construct:
2186 // switch (IV) {
2187 // case 0:
2188 // <SectionStmt[0]>;
2189 // break;
2190 // ...
2191 // case <NumSection> - 1:
2192 // <SectionStmt[<NumSection> - 1]>;
2193 // break;
2194 // }
2195 // ...
2196 // section_loop.after:
2197 // <FiniCB>;
2198 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2199 Builder.restoreIP(CodeGenIP);
2200 BasicBlock *Continue =
2201 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2202 Function *CurFn = Continue->getParent();
2203 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2204
2205 unsigned CaseNumber = 0;
2206 for (auto SectionCB : SectionCBs) {
2207 BasicBlock *CaseBB = BasicBlock::Create(
2208 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2209 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2210 Builder.SetInsertPoint(CaseBB);
2211 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2212 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2213 CaseEndBr->getIterator()}))
2214 return Err;
2215 CaseNumber++;
2216 }
2217 // Remove the existing terminator from the body BB since there can be no
2218 // terminators after a switch/case.
2219 return Error::success();
2220 };
2221 // Loop body ends here
2222 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2223 Type *I32Ty = Type::getInt32Ty(M.getContext());
2224 Value *LB = ConstantInt::get(I32Ty, 0);
2225 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2226 Value *ST = ConstantInt::get(I32Ty, 1);
2227 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2228 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2229 if (!LoopInfo)
2230 return LoopInfo.takeError();
2231
2232 InsertPointOrErrorTy WsloopIP =
2233 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2234 if (!WsloopIP)
2235 return WsloopIP.takeError();
2236 InsertPointTy AfterIP = *WsloopIP;
2237
2238 // Apply the finalization callback in LoopAfterBB
2239 auto FiniInfo = FinalizationStack.pop_back_val();
2240 assert(FiniInfo.DK == OMPD_sections &&
2241 "Unexpected finalization stack state!");
2242 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2243 Builder.restoreIP(AfterIP);
2244 BasicBlock *FiniBB =
2245 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2246 if (Error Err = CB(Builder.saveIP()))
2247 return Err;
2248 AfterIP = {FiniBB, FiniBB->begin()};
2249 }
2250
2251 return AfterIP;
2252}
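// Behaviorally, for n section callbacks the generated construct corresponds
// to this C sketch (iterations are then shared among threads by the static
// workshare loop applied above):
//
// \code{c}
// for (int iv = 0; iv < n; ++iv) {
//   switch (iv) {
//   case 0: /* SectionStmt[0] */ break;
//   /* ... */
//   case n - 1: /* SectionStmt[n-1] */ break;
//   }
// }
// \endcode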
2253
2254OpenMPIRBuilder::InsertPointOrErrorTy
2255OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2256 BodyGenCallbackTy BodyGenCB,
2257 FinalizeCallbackTy FiniCB) {
2258 if (!updateToLocation(Loc))
2259 return Loc.IP;
2260
2261 auto FiniCBWrapper = [&](InsertPointTy IP) {
2262 if (IP.getBlock()->end() != IP.getPoint())
2263 return FiniCB(IP);
2264 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2265 // will fail because that function requires the Finalization Basic Block to
2266 // have a terminator, which is already removed by EmitOMPRegionBody.
2267 // IP is currently at the cancellation block.
2268 // We need to backtrack to the condition block to fetch
2269 // the exit block and create a branch from the cancellation
2270 // block to the exit block.
2271 IRBuilder<>::InsertPointGuard IPG(Builder);
2272 Builder.restoreIP(IP);
2273 auto *CaseBB = Loc.IP.getBlock();
2274 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2275 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2276 Instruction *I = Builder.CreateBr(ExitBB);
2277 IP = InsertPointTy(I->getParent(), I->getIterator());
2278 return FiniCB(IP);
2279 };
2280
2281 Directive OMPD = Directive::OMPD_sections;
2282 // Since we are using Finalization Callback here, HasFinalize
2283 // and IsCancellable have to be true
2284 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2285 /*Conditional*/ false, /*hasFinalize*/ true,
2286 /*IsCancellable*/ true);
2287}
2288
2289static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2290 BasicBlock::iterator IT(I);
2291 IT++;
2292 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2293}
2294
2295Value *OpenMPIRBuilder::getGPUThreadID() {
2296 return Builder.CreateCall(
2297 getOrCreateRuntimeFunction(M,
2298 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2299 {});
2300}
2301
2302Value *OpenMPIRBuilder::getGPUWarpSize() {
2303 return Builder.CreateCall(
2304 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2305}
2306
2307Value *OpenMPIRBuilder::getNVPTXWarpID() {
2308 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2309 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2310}
2311
2312Value *OpenMPIRBuilder::getNVPTXLaneID() {
2313 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2314 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2315 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2316 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2317 "nvptx_lane_id");
2318}
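// Worked example: with the common warp size of 32, LaneIDBits is 5, so a
// hardware thread id of 70 yields warp id 70 >> 5 == 2 and lane id
// 70 & 0x1f == 6.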
2319
2320Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2321 Type *ToType) {
2322 Type *FromType = From->getType();
2323 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2324 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2325 assert(FromSize > 0 && "From size must be greater than zero");
2326 assert(ToSize > 0 && "To size must be greater than zero");
2327 if (FromType == ToType)
2328 return From;
2329 if (FromSize == ToSize)
2330 return Builder.CreateBitCast(From, ToType);
2331 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2332 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2333 InsertPointTy SaveIP = Builder.saveIP();
2334 Builder.restoreIP(AllocaIP);
2335 Value *CastItem = Builder.CreateAlloca(ToType);
2336 Builder.restoreIP(SaveIP);
2337
2338 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2339 CastItem, Builder.getPtrTy(0));
2340 Builder.CreateStore(From, ValCastItem);
2341 return Builder.CreateLoad(ToType, CastItem);
2342}
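// Example behaviors of this helper (sketch): float -> i32 is a plain bitcast
// (equal store sizes); i16 -> i64 is a signed integer cast; a 6-byte
// aggregate -> i64 takes the alloca path, storing the value and loading it
// back through the same stack slot with the destination type.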
2343
2344Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2345 Value *Element,
2346 Type *ElementType,
2347 Value *Offset) {
2348 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2349 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2350
2351 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2352 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2353 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2354 Value *WarpSize =
2355 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2356 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2357 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2358 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2359 Value *WarpSizeCast =
2360 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2361 Value *ShuffleCall =
2362 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2363 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2364}
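// C-level sketch of the device runtime shuffle entry points selected above
// (parameter names are approximate; see the device RTL for the real ones):
//
// \code{c}
// int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t width);
// int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t width);
// \endcode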
2365
2366void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2367 Value *DstAddr, Type *ElemType,
2368 Value *Offset, Type *ReductionArrayTy) {
2369 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2370 // Create the loop over the big sized data.
2371 // ptr = (void*)Elem;
2372 // ptrEnd = (void*) Elem + 1;
2373 // Step = 8;
2374 // while (ptr + Step < ptrEnd)
2375 // shuffle((int64_t)*ptr);
2376 // Step = 4;
2377 // while (ptr + Step < ptrEnd)
2378 // shuffle((int32_t)*ptr);
2379 // ...
2380 Type *IndexTy = Builder.getIndexTy(
2381 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2382 Value *ElemPtr = DstAddr;
2383 Value *Ptr = SrcAddr;
2384 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2385 if (Size < IntSize)
2386 continue;
2387 Type *IntType = Builder.getIntNTy(IntSize * 8);
2388 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2389 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2390 Value *SrcAddrGEP =
2391 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2392 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2393 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2394
2395 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2396 if ((Size / IntSize) > 1) {
2397 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2398 SrcAddrGEP, Builder.getPtrTy());
2399 BasicBlock *PreCondBB =
2400 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2401 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2402 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2403 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2404 emitBlock(PreCondBB, CurFunc);
2405 PHINode *PhiSrc =
2406 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2407 PhiSrc->addIncoming(Ptr, CurrentBB);
2408 PHINode *PhiDest =
2409 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2410 PhiDest->addIncoming(ElemPtr, CurrentBB);
2411 Ptr = PhiSrc;
2412 ElemPtr = PhiDest;
2413 Value *PtrDiff = Builder.CreatePtrDiff(
2414 Builder.getInt8Ty(), PtrEnd,
2415 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2416 Builder.CreateCondBr(
2417 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2418 ExitBB);
2419 emitBlock(ThenBB, CurFunc);
2420 Value *Res = createRuntimeShuffleFunction(
2421 AllocaIP,
2422 Builder.CreateAlignedLoad(
2423 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2424 IntType, Offset);
2425 Builder.CreateAlignedStore(Res, ElemPtr,
2426 M.getDataLayout().getPrefTypeAlign(ElemType));
2427 Value *LocalPtr =
2428 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2429 Value *LocalElemPtr =
2430 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2431 PhiSrc->addIncoming(LocalPtr, ThenBB);
2432 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2433 emitBranch(PreCondBB);
2434 emitBlock(ExitBB, CurFunc);
2435 } else {
2436 Value *Res = createRuntimeShuffleFunction(
2437 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2438 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2439 Res->getType()->getScalarSizeInBits())
2440 Res = Builder.CreateTrunc(Res, ElemType);
2441 Builder.CreateStore(Res, ElemPtr);
2442 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2443 ElemPtr =
2444 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2445 }
2446 Size = Size % IntSize;
2447 }
2448}
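// Worked example: for a 12-byte element, the IntSize==8 pass performs a
// single 64-bit shuffle (Size / IntSize == 1, so no loop is needed), the
// remaining 4 bytes are handled by the IntSize==4 pass, and the 2- and
// 1-byte passes are skipped because Size has dropped to zero.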
2449
2450void OpenMPIRBuilder::emitReductionListCopy(
2451 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2452 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2453 CopyOptionsTy CopyOptions) {
2454 Type *IndexTy = Builder.getIndexTy(
2455 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2456 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2457
2458 // Iterates, element-by-element, through the source Reduce list and
2459 // makes a copy.
2460 for (auto En : enumerate(ReductionInfos)) {
2461 const ReductionInfo &RI = En.value();
2462 Value *SrcElementAddr = nullptr;
2463 Value *DestElementAddr = nullptr;
2464 Value *DestElementPtrAddr = nullptr;
2465 // Should we shuffle in an element from a remote lane?
2466 bool ShuffleInElement = false;
2467 // Set to true to update the pointer in the dest Reduce list to a
2468 // newly created element.
2469 bool UpdateDestListPtr = false;
2470
2471 // Step 1.1: Get the address for the src element in the Reduce list.
2472 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2473 ReductionArrayTy, SrcBase,
2474 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2475 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2476
2477 // Step 1.2: Create a temporary to store the element in the destination
2478 // Reduce list.
2479 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2480 ReductionArrayTy, DestBase,
2481 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2482 switch (Action) {
2483 case CopyAction::RemoteLaneToThread: {
2484 InsertPointTy CurIP = Builder.saveIP();
2485 Builder.restoreIP(AllocaIP);
2486 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2487 ".omp.reduction.element");
2488 DestAlloca->setAlignment(
2489 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2490 DestElementAddr = DestAlloca;
2491 DestElementAddr =
2492 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2493 DestElementAddr->getName() + ".ascast");
2494 Builder.restoreIP(CurIP);
2495 ShuffleInElement = true;
2496 UpdateDestListPtr = true;
2497 break;
2498 }
2499 case CopyAction::ThreadCopy: {
2500 DestElementAddr =
2501 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2502 break;
2503 }
2504 }
2505
2506 // Now that all active lanes have read the element in the
2507 // Reduce list, shuffle over the value from the remote lane.
2508 if (ShuffleInElement) {
2509 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2510 RemoteLaneOffset, ReductionArrayTy);
2511 } else {
2512 switch (RI.EvaluationKind) {
2513 case EvalKind::Scalar: {
2514 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2515 // Store the source element value to the dest element address.
2516 Builder.CreateStore(Elem, DestElementAddr);
2517 break;
2518 }
2519 case EvalKind::Complex: {
2520 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2521 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2522 Value *SrcReal = Builder.CreateLoad(
2523 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2524 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2525 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2526 Value *SrcImg = Builder.CreateLoad(
2527 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2528
2529 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2530 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2531 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2532 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2533 Builder.CreateStore(SrcReal, DestRealPtr);
2534 Builder.CreateStore(SrcImg, DestImgPtr);
2535 break;
2536 }
2537 case EvalKind::Aggregate: {
2538 Value *SizeVal = Builder.getInt64(
2539 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2540 Builder.CreateMemCpy(
2541 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2542 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2543 SizeVal, false);
2544 break;
2545 }
2546 };
2547 }
2548
2549 // Step 3.1: Modify reference in dest Reduce list as needed.
2550 // Modifying the reference in Reduce list to point to the newly
2551 // created element. The element is live in the current function
2552 // scope and that of functions it invokes (i.e., reduce_function).
2553 // RemoteReduceData[i] = (void*)&RemoteElem
2554 if (UpdateDestListPtr) {
2555 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2556 DestElementAddr, Builder.getPtrTy(),
2557 DestElementAddr->getName() + ".ascast");
2558 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2559 }
2560 }
2561}
2562
2563Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2564 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2565 AttributeList FuncAttrs) {
2566 InsertPointTy SavedIP = Builder.saveIP();
2567 LLVMContext &Ctx = M.getContext();
2568 FunctionType *FuncTy = FunctionType::get(
2569 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2570 /* IsVarArg */ false);
2571 Function *WcFunc =
2573 "_omp_reduction_inter_warp_copy_func", &M);
2574 WcFunc->setAttributes(FuncAttrs);
2575 WcFunc->addParamAttr(0, Attribute::NoUndef);
2576 WcFunc->addParamAttr(1, Attribute::NoUndef);
2577 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2578 Builder.SetInsertPoint(EntryBB);
2579
2580 // ReduceList: thread local Reduce list.
2581 // At the stage of the computation when this function is called, partially
2582 // aggregated values reside in the first lane of every active warp.
2583 Argument *ReduceListArg = WcFunc->getArg(0);
2584 // NumWarps: number of warps active in the parallel region. This could
2585 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2586 Argument *NumWarpsArg = WcFunc->getArg(1);
2587
2588 // This array is used as a medium to transfer, one reduce element at a time,
2589 // the data from the first lane of every warp to lanes in the first warp
2590 // in order to perform the final step of a reduction in a parallel region
2591 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2592 // for reduced latency, as well as to have a distinct copy for concurrently
2593 // executing target regions. The array is declared with weak linkage so
2594 // as to be shared across compilation units.
2595 StringRef TransferMediumName =
2596 "__openmp_nvptx_data_transfer_temporary_storage";
2597 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2598 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2599 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2600 if (!TransferMedium) {
2601 TransferMedium = new GlobalVariable(
2602 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2603 UndefValue::get(ArrayTy), TransferMediumName,
2604 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2605 /*AddressSpace=*/3);
2606 }
2607
2608 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2609 Value *GPUThreadID = getGPUThreadID();
2610 // nvptx_lane_id = nvptx_id % warpsize
2611 Value *LaneID = getNVPTXLaneID();
2612 // nvptx_warp_id = nvptx_id / warpsize
2613 Value *WarpID = getNVPTXWarpID();
2614
2615 InsertPointTy AllocaIP =
2616 InsertPointTy(Builder.GetInsertBlock(),
2617 Builder.GetInsertBlock()->getFirstInsertionPt());
2618 Type *Arg0Type = ReduceListArg->getType();
2619 Type *Arg1Type = NumWarpsArg->getType();
2620 Builder.restoreIP(AllocaIP);
2621 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2622 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2623 AllocaInst *NumWarpsAlloca =
2624 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2625 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2626 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2627 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2628 NumWarpsAlloca, Builder.getPtrTy(0),
2629 NumWarpsAlloca->getName() + ".ascast");
2630 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2631 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2632 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2633 InsertPointTy CodeGenIP =
2634 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2635 Builder.restoreIP(CodeGenIP);
2636
2637 Value *ReduceList =
2638 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2639
2640 for (auto En : enumerate(ReductionInfos)) {
2641 //
2642 // Warp master copies reduce element to transfer medium in __shared__
2643 // memory.
2644 //
2645 const ReductionInfo &RI = En.value();
2646 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2647 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2648 Type *CType = Builder.getIntNTy(TySize * 8);
2649
2650 unsigned NumIters = RealTySize / TySize;
2651 if (NumIters == 0)
2652 continue;
2653 Value *Cnt = nullptr;
2654 Value *CntAddr = nullptr;
2655 BasicBlock *PrecondBB = nullptr;
2656 BasicBlock *ExitBB = nullptr;
2657 if (NumIters > 1) {
2658 CodeGenIP = Builder.saveIP();
2659 Builder.restoreIP(AllocaIP);
2660 CntAddr =
2661 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2662
2663 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2664 CntAddr->getName() + ".ascast");
2665 Builder.restoreIP(CodeGenIP);
2666 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2667 CntAddr,
2668 /*Volatile=*/false);
2669 PrecondBB = BasicBlock::Create(Ctx, "precond");
2670 ExitBB = BasicBlock::Create(Ctx, "exit");
2671 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2672 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2673 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2674 /*Volatile=*/false);
2675 Value *Cmp = Builder.CreateICmpULT(
2676 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2677 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2678 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2679 }
2680
2681 // kmpc_barrier.
2682 InsertPointOrErrorTy BarrierIP1 =
2683 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2684 omp::Directive::OMPD_unknown,
2685 /* ForceSimpleCall */ false,
2686 /* CheckCancelFlag */ true);
2687 if (!BarrierIP1)
2688 return BarrierIP1.takeError();
2689 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2690 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2691 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2692
2693 // if (lane_id == 0)
2694 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2695 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2696 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2697
2698 // Reduce element = LocalReduceList[i]
2699 auto *RedListArrayTy =
2700 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2701 Type *IndexTy = Builder.getIndexTy(
2702 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2703 Value *ElemPtrPtr =
2704 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2705 {ConstantInt::get(IndexTy, 0),
2706 ConstantInt::get(IndexTy, En.index())});
2707 // elemptr = ((CopyType*)(elemptrptr)) + I
2708 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2709 if (NumIters > 1)
2710 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2711
2712 // Get pointer to location in transfer medium.
2713 // MediumPtr = &medium[warp_id]
2714 Value *MediumPtr = Builder.CreateInBoundsGEP(
2715 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2716 // elem = *elemptr
2717 //*MediumPtr = elem
2718 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2719 // Store the source element value to the dest element address.
2720 Builder.CreateStore(Elem, MediumPtr,
2721 /*IsVolatile*/ true);
2722 Builder.CreateBr(MergeBB);
2723
2724 // else
2725 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2726 Builder.CreateBr(MergeBB);
2727
2728 // endif
2729 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2730 InsertPointOrErrorTy BarrierIP2 =
2731 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2732 omp::Directive::OMPD_unknown,
2733 /* ForceSimpleCall */ false,
2734 /* CheckCancelFlag */ true);
2735 if (!BarrierIP2)
2736 return BarrierIP2.takeError();
2737
2738 // Warp 0 copies reduce element from transfer medium
2739 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2740 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2741 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2742
2743 Value *NumWarpsVal =
2744 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2745 // Up to 32 threads in warp 0 are active.
2746 Value *IsActiveThread =
2747 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2748 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2749
2750 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2751
2752 // SecMediumPtr = &medium[tid]
2753 // SrcMediumVal = *SrcMediumPtr
2754 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2755 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2756 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2757 Value *TargetElemPtrPtr =
2758 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2759 {ConstantInt::get(IndexTy, 0),
2760 ConstantInt::get(IndexTy, En.index())});
2761 Value *TargetElemPtrVal =
2762 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2763 Value *TargetElemPtr = TargetElemPtrVal;
2764 if (NumIters > 1)
2765 TargetElemPtr =
2766 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2767
2768 // *TargetElemPtr = SrcMediumVal;
2769 Value *SrcMediumValue =
2770 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2771 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2772 Builder.CreateBr(W0MergeBB);
2773
2774 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2775 Builder.CreateBr(W0MergeBB);
2776
2777 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2778
2779 if (NumIters > 1) {
2780 Cnt = Builder.CreateNSWAdd(
2781 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2782 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2783
2784 auto *CurFn = Builder.GetInsertBlock()->getParent();
2785 emitBranch(PrecondBB);
2786 emitBlock(ExitBB, CurFn);
2787 }
2788 RealTySize %= TySize;
2789 }
2790 }
2791
2792 Builder.CreateRetVoid();
2793 Builder.restoreIP(SavedIP);
2794
2795 return WcFunc;
2796}
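// Shape of the generated helper (pseudo-C sketch; chunk_of() stands for the
// 32-bit-or-smaller slice being staged on a given iteration):
//
// \code{c}
// void _omp_reduction_inter_warp_copy_func(void **reduce_list, int num_warps) {
//   /* for each reduce element, per chunk: */
//   /* barrier; */ if (lane_id == 0) medium[warp_id] = chunk_of(reduce_list);
//   /* barrier; */ if (tid < num_warps) chunk_of(reduce_list) = medium[tid];
// }
// \endcode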
2797
2798Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2799 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2800 AttributeList FuncAttrs) {
2801 LLVMContext &Ctx = M.getContext();
2802 FunctionType *FuncTy =
2803 FunctionType::get(Builder.getVoidTy(),
2804 {Builder.getPtrTy(), Builder.getInt16Ty(),
2805 Builder.getInt16Ty(), Builder.getInt16Ty()},
2806 /* IsVarArg */ false);
2807 Function *SarFunc =
2809 "_omp_reduction_shuffle_and_reduce_func", &M);
2810 SarFunc->setAttributes(FuncAttrs);
2811 SarFunc->addParamAttr(0, Attribute::NoUndef);
2812 SarFunc->addParamAttr(1, Attribute::NoUndef);
2813 SarFunc->addParamAttr(2, Attribute::NoUndef);
2814 SarFunc->addParamAttr(3, Attribute::NoUndef);
2815 SarFunc->addParamAttr(1, Attribute::SExt);
2816 SarFunc->addParamAttr(2, Attribute::SExt);
2817 SarFunc->addParamAttr(3, Attribute::SExt);
2818 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2819 Builder.SetInsertPoint(EntryBB);
2820
2821 // Thread local Reduce list used to host the values of data to be reduced.
2822 Argument *ReduceListArg = SarFunc->getArg(0);
2823 // Current lane id; could be logical.
2824 Argument *LaneIDArg = SarFunc->getArg(1);
2825 // Offset of the remote source lane relative to the current lane.
2826 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2827 // Algorithm version. This is expected to be known at compile time.
2828 Argument *AlgoVerArg = SarFunc->getArg(3);
2829
2830 Type *ReduceListArgType = ReduceListArg->getType();
2831 Type *LaneIDArgType = LaneIDArg->getType();
2832 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2833 Value *ReduceListAlloca = Builder.CreateAlloca(
2834 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2835 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2836 LaneIDArg->getName() + ".addr");
2837 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2838 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2839 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2840 AlgoVerArg->getName() + ".addr");
2841 ArrayType *RedListArrayTy =
2842 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2843
2844 // Create a local thread-private variable to host the Reduce list
2845 // from a remote lane.
2846 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2847 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2848
2849 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2850 ReduceListAlloca, ReduceListArgType,
2851 ReduceListAlloca->getName() + ".ascast");
2852 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2853 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2854 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2855 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2856 RemoteLaneOffsetAlloca->getName() + ".ascast");
2857 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2858 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2859 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2860 RemoteReductionListAlloca, Builder.getPtrTy(),
2861 RemoteReductionListAlloca->getName() + ".ascast");
2862
2863 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2864 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2865 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2866 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2867
2868 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2869 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2870 Value *RemoteLaneOffset =
2871 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2872 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2873
2874 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2875
2876 // This loop iterates through the list of reduce elements and copies,
2877 // element by element, from a remote lane in the warp to RemoteReduceList,
2878 // hosted on the thread's stack.
2879 emitReductionListCopy(
2880 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2881 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2882
2883 // The actions to be performed on the Remote Reduce list are dependent
2884 // on the algorithm version.
2885 //
2886 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2887 // LaneId % 2 == 0 && Offset > 0):
2888 // do the reduction value aggregation
2889 //
2890 // The thread local variable Reduce list is mutated in place to host the
2891 // reduced data, which is the aggregated value produced from local and
2892 // remote lanes.
2893 //
2894 // Note that AlgoVer is expected to be a constant integer known at compile
2895 // time.
2896 // When AlgoVer==0, the first conjunction evaluates to true, making
2897 // the entire predicate true during compile time.
2898 // When AlgoVer==1, the second conjunction has only the second part to be
2899 // evaluated during runtime. Other conjunctions evaluate to false
2900 // during compile time.
2901 // When AlgoVer==2, the third conjunction has only the second part to be
2902 // evaluated during runtime. Other conjunctions evaluate to false
2903 // during compile time.
2904 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2905 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2906 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2907 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2908 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2909 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2910 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2911 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2912 Value *RemoteOffsetComp =
2913 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2914 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2915 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2916 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2917
2918 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2919 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2920 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2921
2922 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2923 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2924 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2925 ReduceList, Builder.getPtrTy());
2926 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2927 RemoteListAddrCast, Builder.getPtrTy());
2928 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2929 ->addFnAttr(Attribute::NoUnwind);
2930 Builder.CreateBr(MergeBB);
2931
2932 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2933 Builder.CreateBr(MergeBB);
2934
2935 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2936
2937 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2938 // Reduce list.
2939 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2940 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2941 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2942
2943 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2944 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2945 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2946 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2947
2948 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2949 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2950 ReductionInfos, RemoteListAddrCast, ReduceList);
2951 Builder.CreateBr(CpyMergeBB);
2952
2953 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2954 Builder.CreateBr(CpyMergeBB);
2955
2956 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2957
2958 Builder.CreateRetVoid();
2959
2960 return SarFunc;
2961}
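// The reduce-or-not predicate built above distills to this C expression
// (sketch; AlgoVer and Offset are compile-time constants in practice):
//
// \code{c}
// bool do_reduce = AlgoVer == 0 ||
//                  (AlgoVer == 1 && LaneId < Offset) ||
//                  (AlgoVer == 2 && (LaneId & 1) == 0 && Offset > 0);
// \endcode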
2962
2963Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2964 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2965 AttributeList FuncAttrs) {
2967 LLVMContext &Ctx = M.getContext();
2970 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2971 /* IsVarArg */ false);
2972 Function *LtGCFunc =
2974 "_omp_reduction_list_to_global_copy_func", &M);
2975 LtGCFunc->setAttributes(FuncAttrs);
2976 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2977 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2978 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2979
2980 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2981 Builder.SetInsertPoint(EntryBlock);
2982
2983 // Buffer: global reduction buffer.
2984 Argument *BufferArg = LtGCFunc->getArg(0);
2985 // Idx: index of the buffer.
2986 Argument *IdxArg = LtGCFunc->getArg(1);
2987 // ReduceList: thread local Reduce list.
2988 Argument *ReduceListArg = LtGCFunc->getArg(2);
2989
2990 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2991 BufferArg->getName() + ".addr");
2992 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2993 IdxArg->getName() + ".addr");
2994 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2995 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2996 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2997 BufferArgAlloca, Builder.getPtrTy(),
2998 BufferArgAlloca->getName() + ".ascast");
2999 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3000 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3001 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3002 ReduceListArgAlloca, Builder.getPtrTy(),
3003 ReduceListArgAlloca->getName() + ".ascast");
3004
3005 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3006 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3007 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3008
3009 Value *LocalReduceList =
3010 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3011 Value *BufferArgVal =
3012 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3013 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3014 Type *IndexTy = Builder.getIndexTy(
3015 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3016 for (auto En : enumerate(ReductionInfos)) {
3017 const ReductionInfo &RI = En.value();
3018 auto *RedListArrayTy =
3019 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3020 // Reduce element = LocalReduceList[i]
3021 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3022 RedListArrayTy, LocalReduceList,
3023 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3024 // elemptr = ((CopyType*)(elemptrptr)) + I
3025 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3026
3027 // Global = Buffer.VD[Idx];
3028 Value *BufferVD =
3029 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3030 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3031 ReductionsBufferTy, BufferVD, 0, En.index());
3032
3033 switch (RI.EvaluationKind) {
3034 case EvalKind::Scalar: {
3035 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3036 Builder.CreateStore(TargetElement, GlobVal);
3037 break;
3038 }
3039 case EvalKind::Complex: {
3040 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3041 RI.ElementType, ElemPtr, 0, 0, ".realp");
3042 Value *SrcReal = Builder.CreateLoad(
3043 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3044 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3045 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3046 Value *SrcImg = Builder.CreateLoad(
3047 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3048
3049 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3050 RI.ElementType, GlobVal, 0, 0, ".realp");
3051 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3052 RI.ElementType, GlobVal, 0, 1, ".imagp");
3053 Builder.CreateStore(SrcReal, DestRealPtr);
3054 Builder.CreateStore(SrcImg, DestImgPtr);
3055 break;
3056 }
3057 case EvalKind::Aggregate: {
3058 Value *SizeVal =
3059 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3060 Builder.CreateMemCpy(
3061 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3062 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3063 break;
3064 }
3065 }
3066 }
3067
3068 Builder.CreateRetVoid();
3069 Builder.restoreIP(OldIP);
3070 return LtGCFunc;
3071}
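// Net effect of the generated helper (pseudo-C sketch; BufferTy and ElemTy
// are stand-ins for the reduction buffer and element types):
//
// \code{c}
// void _omp_reduction_list_to_global_copy_func(void *buffer, int idx,
//                                              void **reduce_list) {
//   /* for each reduction variable i: */
//   ((BufferTy *)buffer)[idx].elem_i = *(ElemTy *)reduce_list[i];
// }
// \endcode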
3072
3073Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3074 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3075 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3077 LLVMContext &Ctx = M.getContext();
3080 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3081 /* IsVarArg */ false);
3082 Function *LtGRFunc =
3084 "_omp_reduction_list_to_global_reduce_func", &M);
3085 LtGRFunc->setAttributes(FuncAttrs);
3086 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3087 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3088 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3089
3090 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3091 Builder.SetInsertPoint(EntryBlock);
3092
3093 // Buffer: global reduction buffer.
3094 Argument *BufferArg = LtGRFunc->getArg(0);
3095 // Idx: index of the buffer.
3096 Argument *IdxArg = LtGRFunc->getArg(1);
3097 // ReduceList: thread local Reduce list.
3098 Argument *ReduceListArg = LtGRFunc->getArg(2);
3099
3100 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3101 BufferArg->getName() + ".addr");
3102 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3103 IdxArg->getName() + ".addr");
3104 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3105 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3106 auto *RedListArrayTy =
3107 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3108
3109 // 1. Build a list of reduction variables.
3110 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3111 Value *LocalReduceList =
3112 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3113
3114 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3115 BufferArgAlloca, Builder.getPtrTy(),
3116 BufferArgAlloca->getName() + ".ascast");
3117 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3118 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3119 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3120 ReduceListArgAlloca, Builder.getPtrTy(),
3121 ReduceListArgAlloca->getName() + ".ascast");
3122 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3123 LocalReduceList, Builder.getPtrTy(),
3124 LocalReduceList->getName() + ".ascast");
3125
3126 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3127 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3128 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3129
3130 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3131 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3132 Type *IndexTy = Builder.getIndexTy(
3133 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3134 for (auto En : enumerate(ReductionInfos)) {
3135 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3136 RedListArrayTy, LocalReduceListAddrCast,
3137 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3138 Value *BufferVD =
3139 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3140 // Global = Buffer.VD[Idx];
3141 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3142 ReductionsBufferTy, BufferVD, 0, En.index());
3143 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3144 }
3145
3146 // Call reduce_function(GlobalReduceList, ReduceList)
3147 Value *ReduceList =
3148 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3149 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3150 ->addFnAttr(Attribute::NoUnwind);
3151 Builder.CreateRetVoid();
3152 Builder.restoreIP(OldIP);
3153 return LtGRFunc;
3154}
3155
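// Mirror of the list-to-global copy helper: the function emitted below copies
// each reduction element out of the global buffer back into the thread-local
// reduce list. Roughly (hypothetical C for orientation):
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for (i in 0..<n>) *reduce_list[i] = buffer[idx].reduction_i;
//   }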
3156 Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3157 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3158 AttributeList FuncAttrs) {
3159 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3160 LLVMContext &Ctx = M.getContext();
3161 auto *FuncTy = FunctionType::get(
3162 Builder.getVoidTy(),
3163 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3164 /* IsVarArg */ false);
3165 Function *LtGCFunc =
3167 "_omp_reduction_global_to_list_copy_func", &M);
3168 LtGCFunc->setAttributes(FuncAttrs);
3169 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3170 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3171 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3172
3173 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3174 Builder.SetInsertPoint(EntryBlock);
3175
3176 // Buffer: global reduction buffer.
3177 Argument *BufferArg = LtGCFunc->getArg(0);
3178 // Idx: index of the buffer.
3179 Argument *IdxArg = LtGCFunc->getArg(1);
3180 // ReduceList: thread local Reduce list.
3181 Argument *ReduceListArg = LtGCFunc->getArg(2);
3182
3183 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3184 BufferArg->getName() + ".addr");
3185 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3186 IdxArg->getName() + ".addr");
3187 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3188 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3189 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3190 BufferArgAlloca, Builder.getPtrTy(),
3191 BufferArgAlloca->getName() + ".ascast");
3192 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3193 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3194 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3195 ReduceListArgAlloca, Builder.getPtrTy(),
3196 ReduceListArgAlloca->getName() + ".ascast");
3197 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3198 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3199 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3200
3201 Value *LocalReduceList =
3202 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3203 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3204 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3205 Type *IndexTy = Builder.getIndexTy(
3206 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3207 for (auto En : enumerate(ReductionInfos)) {
3208 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3209 auto *RedListArrayTy =
3210 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3211 // Reduce element = LocalReduceList[i]
3212 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3213 RedListArrayTy, LocalReduceList,
3214 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3215 // elemptr = ((CopyType*)(elemptrptr)) + I
3216 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3217 // Global = Buffer.VD[Idx];
3218 Value *BufferVD =
3219 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3220 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3221 ReductionsBufferTy, BufferVD, 0, En.index());
3222
3223 switch (RI.EvaluationKind) {
3224 case EvalKind::Scalar: {
3225 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3226 Builder.CreateStore(TargetElement, ElemPtr);
3227 break;
3228 }
3229 case EvalKind::Complex: {
3230 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3231 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3232 Value *SrcReal = Builder.CreateLoad(
3233 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3234 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3235 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3236 Value *SrcImg = Builder.CreateLoad(
3237 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3238
3239 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3240 RI.ElementType, ElemPtr, 0, 0, ".realp");
3241 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3242 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3243 Builder.CreateStore(SrcReal, DestRealPtr);
3244 Builder.CreateStore(SrcImg, DestImgPtr);
3245 break;
3246 }
3247 case EvalKind::Aggregate: {
3248 Value *SizeVal =
3249 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3250 Builder.CreateMemCpy(
3251 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3252 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3253 SizeVal, false);
3254 break;
3255 }
3256 }
3257 }
3258
3259 Builder.CreateRetVoid();
3260 Builder.restoreIP(OldIP);
3261 return LtGCFunc;
3262}
3263
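// Counterpart of the list-to-global reduce helper: the function emitted below
// gathers pointers into the global buffer and reduces into the thread-local
// list, i.e. roughly reduce_function(reduce_list, GlobPtrs) in the sketch
// notation used above.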
3264 Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3265 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3266 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3267 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3268 LLVMContext &Ctx = M.getContext();
3269 auto *FuncTy = FunctionType::get(
3270 Builder.getVoidTy(),
3271 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3272 /* IsVarArg */ false);
3273 Function *LtGRFunc =
3275 "_omp_reduction_global_to_list_reduce_func", &M);
3276 LtGRFunc->setAttributes(FuncAttrs);
3277 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3278 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3279 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3280
3281 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3282 Builder.SetInsertPoint(EntryBlock);
3283
3284 // Buffer: global reduction buffer.
3285 Argument *BufferArg = LtGRFunc->getArg(0);
3286 // Idx: index of the buffer.
3287 Argument *IdxArg = LtGRFunc->getArg(1);
3288 // ReduceList: thread local Reduce list.
3289 Argument *ReduceListArg = LtGRFunc->getArg(2);
3290
3291 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3292 BufferArg->getName() + ".addr");
3293 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3294 IdxArg->getName() + ".addr");
3295 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3296 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3297 ArrayType *RedListArrayTy =
3298 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3299
3300 // 1. Build a list of reduction variables.
3301 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3302 Value *LocalReduceList =
3303 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3304
3306 BufferArgAlloca, Builder.getPtrTy(),
3307 BufferArgAlloca->getName() + ".ascast");
3308 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3309 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3310 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3311 ReduceListArgAlloca, Builder.getPtrTy(),
3312 ReduceListArgAlloca->getName() + ".ascast");
3313 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3314 LocalReduceList, Builder.getPtrTy(),
3315 LocalReduceList->getName() + ".ascast");
3316
3317 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3318 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3319 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3320
3321 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3322 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3323 Type *IndexTy = Builder.getIndexTy(
3324 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3325 for (auto En : enumerate(ReductionInfos)) {
3326 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3327 RedListArrayTy, ReductionList,
3328 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3329 // Global = Buffer.VD[Idx];
3330 Value *BufferVD =
3331 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3332 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3333 ReductionsBufferTy, BufferVD, 0, En.index());
3334 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3335 }
3336
3337 // Call reduce_function(ReduceList, GlobalReduceList)
3338 Value *ReduceList =
3339 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3340 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3341 ->addFnAttr(Attribute::NoUnwind);
3342 Builder.CreateRetVoid();
3343 Builder.restoreIP(OldIP);
3344 return LtGRFunc;
3345}
3346
3347 std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3348 std::string Suffix =
3349 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3350 return (Name + Suffix).str();
3351}
3352
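// The reduction function created below receives two type-erased arrays of
// pointers and combines them elementwise; a sketch (hypothetical C) of the
// body emitted for the non-Clang callback path:
//   void reduction_func(void **lhs, void **rhs) {
//     for (i in 0..<n>) *lhs[i] = reduce_op(*lhs[i], *rhs[i]);
//   }
// For the Clang path the elementwise combination comes from ReductionGenClang
// and the placeholder LHS/RHS values are patched up afterwards.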
3353 Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3354 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3355 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3356 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3357 {Builder.getPtrTy(), Builder.getPtrTy()},
3358 /* IsVarArg */ false);
3359 std::string Name = getReductionFuncName(ReducerName);
3360 Function *ReductionFunc =
3361 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3362 ReductionFunc->setAttributes(FuncAttrs);
3363 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3364 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3365 BasicBlock *EntryBB =
3366 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3367 Builder.SetInsertPoint(EntryBB);
3368
3369 // Need to alloca memory here and deal with the pointers before getting
3370 // LHS/RHS pointers out
3371 Value *LHSArrayPtr = nullptr;
3372 Value *RHSArrayPtr = nullptr;
3373 Argument *Arg0 = ReductionFunc->getArg(0);
3374 Argument *Arg1 = ReductionFunc->getArg(1);
3375 Type *Arg0Type = Arg0->getType();
3376 Type *Arg1Type = Arg1->getType();
3377
3378 Value *LHSAlloca =
3379 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3380 Value *RHSAlloca =
3381 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3382 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3383 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3384 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3385 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3386 Builder.CreateStore(Arg0, LHSAddrCast);
3387 Builder.CreateStore(Arg1, RHSAddrCast);
3388 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3389 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3390
3391 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3392 Type *IndexTy = Builder.getIndexTy(
3393 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3394 SmallVector<Value *> LHSPtrs, RHSPtrs;
3395 for (auto En : enumerate(ReductionInfos)) {
3396 const ReductionInfo &RI = En.value();
3397 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3398 RedArrayTy, RHSArrayPtr,
3399 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3400 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3401 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3402 RHSI8Ptr, RI.PrivateVariable->getType(),
3403 RHSI8Ptr->getName() + ".ascast");
3404
3405 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3406 RedArrayTy, LHSArrayPtr,
3407 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3408 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3409 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3410 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3411
3412 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3413 LHSPtrs.emplace_back(LHSPtr);
3414 RHSPtrs.emplace_back(RHSPtr);
3415 } else {
3416 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3417 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3418 Value *Reduced;
3419 InsertPointOrErrorTy AfterIP =
3420 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3421 if (!AfterIP)
3422 return AfterIP.takeError();
3423 if (!Builder.GetInsertBlock())
3424 return ReductionFunc;
3425 Builder.CreateStore(Reduced, LHSPtr);
3426 }
3427 }
3428
3429 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3430 for (auto En : enumerate(ReductionInfos)) {
3431 unsigned Index = En.index();
3432 const ReductionInfo &RI = En.value();
3433 Value *LHSFixupPtr, *RHSFixupPtr;
3434 Builder.restoreIP(RI.ReductionGenClang(
3435 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3436
3437 // Fix the callback code generated to use the correct Values for the LHS
3438 // and RHS
3439 LHSFixupPtr->replaceUsesWithIf(
3440 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3441 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3442 ReductionFunc;
3443 });
3444 RHSFixupPtr->replaceUsesWithIf(
3445 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3446 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3447 ReductionFunc;
3448 });
3449 }
3450
3451 Builder.CreateRetVoid();
3452 return ReductionFunc;
3453}
3454
3455static void
3456 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3457 bool IsGPU) {
3458 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3459 (void)RI;
3460 assert(RI.Variable && "expected non-null variable");
3461 assert(RI.PrivateVariable && "expected non-null private variable");
3462 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3463 "expected non-null reduction generator callback");
3464 if (!IsGPU) {
3465 assert(
3466 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3467 "expected variables and their private equivalents to have the same "
3468 "type");
3469 }
3470 assert(RI.Variable->getType()->isPointerTy() &&
3471 "expected variables to be pointers");
3472 }
3473}
3474
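// createReductionsGPU below lowers reductions to a call to one of the
// __kmpc_nvptx_*_reduce_nowait_v2 entry points, passing the helper functions
// emitted above (shuffle-and-reduce and inter-warp copy, plus the four
// list<->global-buffer helpers for teams reductions). Schematically, the
// emitted control flow is:
//   if (__kmpc_nvptx_..._reduce_nowait_v2(...) == 1) {
//     <copy the reduced private values back into the original variables>
//   }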
3475 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3476 const LocationDescription &Loc, InsertPointTy AllocaIP,
3477 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3478 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3479 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3480 unsigned ReductionBufNum, Value *SrcLocInfo) {
3481 if (!updateToLocation(Loc))
3482 return InsertPointTy();
3483 Builder.restoreIP(CodeGenIP);
3484 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3485 LLVMContext &Ctx = M.getContext();
3486
3487 // Source location for the ident struct
3488 if (!SrcLocInfo) {
3489 uint32_t SrcLocStrSize;
3490 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3491 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3492 }
3493
3494 if (ReductionInfos.size() == 0)
3495 return Builder.saveIP();
3496
3497 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3498 AttributeList FuncAttrs;
3499 AttrBuilder AttrBldr(Ctx);
3500 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3501 AttrBldr.addAttribute(Attr);
3502 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3503 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3504
3505 CodeGenIP = Builder.saveIP();
3506 Expected<Function *> ReductionResult =
3507 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3508 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3509 if (!ReductionResult)
3510 return ReductionResult.takeError();
3511 Function *ReductionFunc = *ReductionResult;
3512 Builder.restoreIP(CodeGenIP);
3513
3514 // Set the grid value in the config needed for lowering later on
3515 if (GridValue.has_value())
3516 Config.setGridValue(GridValue.value());
3517 else
3518 Config.setGridValue(getGridValue(T, ReductionFunc));
3519
3520 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3521 // RedList, shuffle_reduce_func, interwarp_copy_func);
3522 // or
3523 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3524 Value *Res;
3525
3526 // 1. Build a list of reduction variables.
3527 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3528 auto Size = ReductionInfos.size();
3529 Type *PtrTy = PointerType::getUnqual(Ctx);
3530 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3531 CodeGenIP = Builder.saveIP();
3532 Builder.restoreIP(AllocaIP);
3533 Value *ReductionListAlloca =
3534 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3535 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3536 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3537 Builder.restoreIP(CodeGenIP);
3538 Type *IndexTy = Builder.getIndexTy(
3539 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3540 for (auto En : enumerate(ReductionInfos)) {
3541 const ReductionInfo &RI = En.value();
3542 Value *ElemPtr = Builder.CreateInBoundsGEP(
3543 RedArrayTy, ReductionList,
3544 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3545 Value *CastElem =
3546 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3547 Builder.CreateStore(CastElem, ElemPtr);
3548 }
3549 CodeGenIP = Builder.saveIP();
3550 Function *SarFunc =
3551 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3552 Expected<Function *> CopyResult =
3553 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3554 if (!CopyResult)
3555 return CopyResult.takeError();
3556 Function *WcFunc = *CopyResult;
3557 Builder.restoreIP(CodeGenIP);
3558
3559 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3560
3561 unsigned MaxDataSize = 0;
3562 SmallVector<Type *> ReductionTypeArgs;
3563 for (auto En : enumerate(ReductionInfos)) {
3564 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3565 if (Size > MaxDataSize)
3566 MaxDataSize = Size;
3567 ReductionTypeArgs.emplace_back(En.value().ElementType);
3568 }
3569 Value *ReductionDataSize =
3570 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3571 if (!IsTeamsReduction) {
3572 Value *SarFuncCast =
3573 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3574 Value *WcFuncCast =
3575 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3576 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3577 WcFuncCast};
3578 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3579 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3580 Res = Builder.CreateCall(Pv2Ptr, Args);
3581 } else {
3582 CodeGenIP = Builder.saveIP();
3583 StructType *ReductionsBufferTy = StructType::create(
3584 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3585 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3586 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3587 Function *LtGCFunc = emitListToGlobalCopyFunction(
3588 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3589 Function *LtGRFunc = emitListToGlobalReduceFunction(
3590 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3591 Function *GtLCFunc = emitGlobalToListCopyFunction(
3592 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3593 Function *GtLRFunc = emitGlobalToListReduceFunction(
3594 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3595 Builder.restoreIP(CodeGenIP);
3596
3597 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3598 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3599
3600 Value *Args3[] = {SrcLocInfo,
3601 KernelTeamsReductionPtr,
3602 Builder.getInt32(ReductionBufNum),
3603 ReductionDataSize,
3604 RL,
3605 SarFunc,
3606 WcFunc,
3607 LtGCFunc,
3608 LtGRFunc,
3609 GtLCFunc,
3610 GtLRFunc};
3611
3612 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3613 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3614 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3615 }
3616
3617 // 5. Build if (res == 1)
3618 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3619 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3620 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3621 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3622
3623 // 6. Build then branch: where we have reduced values in the master
3624 // thread in each team.
3625 // __kmpc_end_reduce{_nowait}(<gtid>);
3626 // break;
3627 emitBlock(ThenBB, CurFunc);
3628
3629 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3630 for (auto En : enumerate(ReductionInfos)) {
3631 const ReductionInfo &RI = En.value();
3632 Value *LHS = RI.Variable;
3633 Value *RHS =
3634 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3635
3636 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3637 Value *LHSPtr, *RHSPtr;
3638 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3639 &LHSPtr, &RHSPtr, CurFunc));
3640
3641 // Fix the callback code generated to use the correct Values for the LHS
3642 // and RHS
3643 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3644 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3645 ReductionFunc;
3646 });
3647 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3648 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3649 ReductionFunc;
3650 });
3651 } else {
3652 assert(false && "Unhandled ReductionGenCBKind");
3653 }
3654 }
3655 emitBlock(ExitBB, CurFunc);
3656
3657 Config.setEmitLLVMUsed();
3658
3659 return Builder.saveIP();
3660}
3661
3662 static Function *getFreshReductionFunc(Module &M) {
3663 Type *VoidTy = Type::getVoidTy(M.getContext());
3664 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3665 auto *FuncTy =
3666 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3668 ".omp.reduction.func", &M);
3669}
3670
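// Schematic of the code createReductions emits on the host around the
// runtime call (labels illustrative):
//   switch (__kmpc_reduce{_nowait}(loc, tid, <n>, sizeof(RedList), RedList,
//                                  reduction_func, &<lock>)) {
//   case 1: <nonatomic elementwise reduction>;
//           __kmpc_end_reduce{_nowait}(loc, tid, &<lock>); break;
//   case 2: <atomic elementwise reduction>; break;
//   default: break; // nothing left to do
//   }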
3671 OpenMPIRBuilder::InsertPointOrErrorTy
3672 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3673 InsertPointTy AllocaIP,
3674 ArrayRef<ReductionInfo> ReductionInfos,
3675 ArrayRef<bool> IsByRef, bool IsNoWait) {
3676 assert(ReductionInfos.size() == IsByRef.size());
3677 for (const ReductionInfo &RI : ReductionInfos) {
3678 (void)RI;
3679 assert(RI.Variable && "expected non-null variable");
3680 assert(RI.PrivateVariable && "expected non-null private variable");
3681 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3682 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3683 "expected variables and their private equivalents to have the same "
3684 "type");
3685 assert(RI.Variable->getType()->isPointerTy() &&
3686 "expected variables to be pointers");
3687 }
3688
3689 if (!updateToLocation(Loc))
3690 return InsertPointTy();
3691
3692 BasicBlock *InsertBlock = Loc.IP.getBlock();
3693 BasicBlock *ContinuationBlock =
3694 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3695 InsertBlock->getTerminator()->eraseFromParent();
3696
3697 // Create and populate array of type-erased pointers to private reduction
3698 // values.
3699 unsigned NumReductions = ReductionInfos.size();
3700 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3701 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3702 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3703
3704 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3705
3706 for (auto En : enumerate(ReductionInfos)) {
3707 unsigned Index = En.index();
3708 const ReductionInfo &RI = En.value();
3709 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3710 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3711 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3712 }
3713
3714 // Emit a call to the runtime function that orchestrates the reduction.
3715 // Declare the reduction function in the process.
3716 Function *Func = Builder.GetInsertBlock()->getParent();
3717 Module *Module = Func->getParent();
3718 uint32_t SrcLocStrSize;
3719 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3720 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3721 return RI.AtomicReductionGen;
3722 });
3723 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3724 CanGenerateAtomic
3725 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3726 : IdentFlag(0));
3727 Value *ThreadId = getOrCreateThreadID(Ident);
3728 Constant *NumVariables = Builder.getInt32(NumReductions);
3729 const DataLayout &DL = Module->getDataLayout();
3730 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3731 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3732 Function *ReductionFunc = getFreshReductionFunc(*Module);
3733 Value *Lock = getOMPCriticalRegionLock(".reduction");
3734 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3735 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3736 : RuntimeFunction::OMPRTL___kmpc_reduce);
3737 CallInst *ReduceCall =
3738 Builder.CreateCall(ReduceFunc,
3739 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3740 ReductionFunc, Lock},
3741 "reduce");
3742
3743 // Create final reduction entry blocks for the atomic and non-atomic case.
3744 // Emit IR that dispatches control flow to one of the blocks based on the
3745 // reduction supporting the atomic mode.
3746 BasicBlock *NonAtomicRedBlock =
3747 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3748 BasicBlock *AtomicRedBlock =
3749 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3750 SwitchInst *Switch =
3751 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3752 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3753 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3754
3755 // Populate the non-atomic reduction using the elementwise reduction function.
3756 // This loads the elements from the global and private variables and reduces
3757 // them before storing back the result to the global variable.
3758 Builder.SetInsertPoint(NonAtomicRedBlock);
3759 for (auto En : enumerate(ReductionInfos)) {
3760 const ReductionInfo &RI = En.value();
3761 Type *ValueType = RI.ElementType;
3762 // We have one less load for by-ref case because that load is now inside of
3763 // the reduction region
3764 Value *RedValue = RI.Variable;
3765 if (!IsByRef[En.index()]) {
3766 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3767 "red.value." + Twine(En.index()));
3768 }
3769 Value *PrivateRedValue =
3771 "red.private.value." + Twine(En.index()));
3772 Value *Reduced;
3773 InsertPointOrErrorTy AfterIP =
3774 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3775 if (!AfterIP)
3776 return AfterIP.takeError();
3777 Builder.restoreIP(*AfterIP);
3778
3779 if (!Builder.GetInsertBlock())
3780 return InsertPointTy();
3781 // for by-ref case, the load is inside of the reduction region
3782 if (!IsByRef[En.index()])
3783 Builder.CreateStore(Reduced, RI.Variable);
3784 }
3785 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3786 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3787 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3788 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3789 Builder.CreateBr(ContinuationBlock);
3790
3791 // Populate the atomic reduction using the atomic elementwise reduction
3792 // function. There are no loads/stores here because they will be happening
3793 // inside the atomic elementwise reduction.
3794 Builder.SetInsertPoint(AtomicRedBlock);
3795 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3796 for (const ReductionInfo &RI : ReductionInfos) {
3797 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3798 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3799 if (!AfterIP)
3800 return AfterIP.takeError();
3801 Builder.restoreIP(*AfterIP);
3802 if (!Builder.GetInsertBlock())
3803 return InsertPointTy();
3804 }
3805 Builder.CreateBr(ContinuationBlock);
3806 } else {
3807 Builder.CreateUnreachable();
3808 }
3809
3810 // Populate the outlined reduction function using the elementwise reduction
3811 // function. Partial values are extracted from the type-erased array of
3812 // pointers to private variables.
3813 BasicBlock *ReductionFuncBlock =
3814 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3815 Builder.SetInsertPoint(ReductionFuncBlock);
3816 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3817 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3818
3819 for (auto En : enumerate(ReductionInfos)) {
3820 const ReductionInfo &RI = En.value();
3821 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3822 RedArrayTy, LHSArrayPtr, 0, En.index());
3823 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3824 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3825 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3826 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3827 RedArrayTy, RHSArrayPtr, 0, En.index());
3828 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3829 Value *RHSPtr =
3830 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3831 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3832 Value *Reduced;
3833 InsertPointOrErrorTy AfterIP =
3834 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3835 if (!AfterIP)
3836 return AfterIP.takeError();
3837 Builder.restoreIP(*AfterIP);
3838 if (!Builder.GetInsertBlock())
3839 return InsertPointTy();
3840 // store is inside of the reduction region when using by-ref
3841 if (!IsByRef[En.index()])
3842 Builder.CreateStore(Reduced, LHSPtr);
3843 }
3844 Builder.CreateRetVoid();
3845
3846 Builder.SetInsertPoint(ContinuationBlock);
3847 return Builder.saveIP();
3848}
3849
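// createMaster and createMasked below emit the usual conditional
// inlined-region pattern, conceptually:
//   if (__kmpc_master(loc, tid)) { <body> __kmpc_end_master(loc, tid); }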
3850 OpenMPIRBuilder::InsertPointOrErrorTy
3851 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3852 BodyGenCallbackTy BodyGenCB,
3853 FinalizeCallbackTy FiniCB) {
3854 if (!updateToLocation(Loc))
3855 return Loc.IP;
3856
3857 Directive OMPD = Directive::OMPD_master;
3858 uint32_t SrcLocStrSize;
3859 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3860 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3861 Value *ThreadId = getOrCreateThreadID(Ident);
3862 Value *Args[] = {Ident, ThreadId};
3863
3864 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3865 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3866
3867 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3868 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3869
3870 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3871 /*Conditional*/ true, /*hasFinalize*/ true);
3872}
3873
3874 OpenMPIRBuilder::InsertPointOrErrorTy
3875 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3876 BodyGenCallbackTy BodyGenCB,
3877 FinalizeCallbackTy FiniCB, Value *Filter) {
3878 if (!updateToLocation(Loc))
3879 return Loc.IP;
3880
3881 Directive OMPD = Directive::OMPD_masked;
3882 uint32_t SrcLocStrSize;
3883 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3884 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3885 Value *ThreadId = getOrCreateThreadID(Ident);
3886 Value *Args[] = {Ident, ThreadId, Filter};
3887 Value *ArgsEnd[] = {Ident, ThreadId};
3888
3889 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3890 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3891
3892 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3893 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3894
3895 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3896 /*Conditional*/ true, /*hasFinalize*/ true);
3897}
3898
3899 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3900 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3901 BasicBlock *PostInsertBefore, const Twine &Name) {
3902 Module *M = F->getParent();
3903 LLVMContext &Ctx = M->getContext();
3904 Type *IndVarTy = TripCount->getType();
3905
3906 // Create the basic block structure.
3907 BasicBlock *Preheader =
3908 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3909 BasicBlock *Header =
3910 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3911 BasicBlock *Cond =
3912 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3913 BasicBlock *Body =
3914 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3915 BasicBlock *Latch =
3916 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3917 BasicBlock *Exit =
3918 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3919 BasicBlock *After =
3920 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3921
3922 // Use specified DebugLoc for new instructions.
3923 Builder.SetCurrentDebugLocation(DL);
3924
3925 Builder.SetInsertPoint(Preheader);
3926 Builder.CreateBr(Header);
3927
3928 Builder.SetInsertPoint(Header);
3929 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3930 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3931 Builder.CreateBr(Cond);
3932
3933 Builder.SetInsertPoint(Cond);
3934 Value *Cmp =
3935 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3936 Builder.CreateCondBr(Cmp, Body, Exit);
3937
3938 Builder.SetInsertPoint(Body);
3939 Builder.CreateBr(Latch);
3940
3941 Builder.SetInsertPoint(Latch);
3942 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3943 "omp_" + Name + ".next", /*HasNUW=*/true);
3944 Builder.CreateBr(Header);
3945 IndVarPHI->addIncoming(Next, Latch);
3946
3947 Builder.SetInsertPoint(Exit);
3948 Builder.CreateBr(After);
3949
3950 // Remember and return the canonical control flow.
3951 LoopInfos.emplace_front();
3952 CanonicalLoopInfo *CL = &LoopInfos.front();
3953
3954 CL->Header = Header;
3955 CL->Cond = Cond;
3956 CL->Latch = Latch;
3957 CL->Exit = Exit;
3958
3959#ifndef NDEBUG
3960 CL->assertOK();
3961#endif
3962 return CL;
3963}
3964
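// The skeleton built above wires up the canonical loop CFG:
//   preheader -> header -> cond --(iv < tripcount)--> body -> latch -> header
//                            \--(otherwise)--> exit -> after
// createCanonicalLoop below connects it to the insertion point and fills the
// body through the callback.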
3965 Expected<CanonicalLoopInfo *>
3966 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3967 LoopBodyGenCallbackTy BodyGenCB,
3968 Value *TripCount, const Twine &Name) {
3969 BasicBlock *BB = Loc.IP.getBlock();
3970 BasicBlock *NextBB = BB->getNextNode();
3971
3972 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3973 NextBB, NextBB, Name);
3974 BasicBlock *After = CL->getAfter();
3975
3976 // If location is not set, don't connect the loop.
3977 if (updateToLocation(Loc)) {
3978 // Split the loop at the insertion point: Branch to the preheader and move
3979 // every following instruction to after the loop (the After BB). Also, the
3980 // new successor is the loop's after block.
3981 spliceBB(Builder, After, /*CreateBranch=*/false);
3982 Builder.CreateBr(CL->getPreheader());
3983 }
3984
3985 // Emit the body content. We do it after connecting the loop to the CFG to
3986 // avoid that the callback encounters degenerate BBs.
3987 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
3988 return Err;
3989
3990#ifndef NDEBUG
3991 CL->assertOK();
3992#endif
3993 return CL;
3994}
3995
3996 Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
3997 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3998 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3999 InsertPointTy ComputeIP, const Twine &Name) {
4000
4001 // Consider the following difficulties (assuming 8-bit signed integers):
4002 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4003 // DO I = 1, 100, 50
4004 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4005 // DO I = 100, 0, -128
4006
4007 // Start, Stop and Step must be of the same integer type.
4008 auto *IndVarTy = cast<IntegerType>(Start->getType());
4009 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4010 assert(IndVarTy == Step->getType() && "Step type mismatch");
4011
4012 LocationDescription ComputeLoc =
4013 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4014 updateToLocation(ComputeLoc);
4015
4016 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4017 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4018
4019 // Like Step, but always positive.
4020 Value *Incr = Step;
4021
4022 // Distance between Start and Stop; always positive.
4023 Value *Span;
4024
4025 // Condition checking whether no iterations are executed at all, e.g.
4026 // because UB < LB.
4027 Value *ZeroCmp;
4028
4029 if (IsSigned) {
4030 // Ensure that increment is positive. If not, negate and invert LB and UB.
4031 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4032 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4033 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4034 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4035 Span = Builder.CreateSub(UB, LB, "", false, true);
4036 ZeroCmp = Builder.CreateICmp(
4037 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4038 } else {
4039 Span = Builder.CreateSub(Stop, Start, "", true);
4040 ZeroCmp = Builder.CreateICmp(
4041 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4042 }
4043
4044 Value *CountIfLooping;
4045 if (InclusiveStop) {
4046 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4047 } else {
4048 // Avoid incrementing past stop since it could overflow.
4049 Value *CountIfTwo = Builder.CreateAdd(
4050 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4051 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4052 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4053 }
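// Summary of the trip count computed above (all unsigned, Incr = |Step|,
// [LB, UB] the normalized bounds, Span = UB - LB):
//   TripCount = <no iterations> ? 0
//             : InclusiveStop   ? Span/Incr + 1
//             : Span <= Incr    ? 1
//             :                   (Span - 1)/Incr + 1
// E.g. Start=1, Stop=100, Step=50 with exclusive stop: Span=99, TripCount=2.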
4054 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4055 "omp_" + Name + ".tripcount");
4056
4057 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4058 Builder.restoreIP(CodeGenIP);
4059 Value *Span = Builder.CreateMul(IV, Step);
4060 Value *IndVar = Builder.CreateAdd(Span, Start);
4061 return BodyGenCB(Builder.saveIP(), IndVar);
4062 };
4063 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4064 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4065}
4066
4067// Returns an LLVM function to call for initializing loop bounds using OpenMP
4068// static scheduling depending on `type`. Only i32 and i64 are supported by the
4069// runtime. Always interpret integers as unsigned similarly to
4070// CanonicalLoopInfo.
4072 OpenMPIRBuilder &OMPBuilder) {
4073 unsigned Bitwidth = Ty->getIntegerBitWidth();
4074 if (Bitwidth == 32)
4075 return OMPBuilder.getOrCreateRuntimeFunction(
4076 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4077 if (Bitwidth == 64)
4078 return OMPBuilder.getOrCreateRuntimeFunction(
4079 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4080 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4081}
4082
4083 OpenMPIRBuilder::InsertPointOrErrorTy
4084 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4085 InsertPointTy AllocaIP,
4086 bool NeedsBarrier) {
4087 assert(CLI->isValid() && "Requires a valid canonical loop");
4088 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4089 "Require dedicated allocate IP");
4090
4091 // Set up the source location value for OpenMP runtime.
4092 Builder.restoreIP(CLI->getPreheaderIP());
4093 Builder.SetCurrentDebugLocation(DL);
4094
4095 uint32_t SrcLocStrSize;
4096 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4097 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4098
4099 // Declare useful OpenMP runtime functions.
4100 Value *IV = CLI->getIndVar();
4101 Type *IVTy = IV->getType();
4102 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4103 FunctionCallee StaticFini =
4104 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4105
4106 // Allocate space for computed loop bounds as expected by the "init" function.
4107 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4108
4109 Type *I32Type = Type::getInt32Ty(M.getContext());
4110 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4111 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4112 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4113 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4114
4115 // At the end of the preheader, prepare for calling the "init" function by
4116 // storing the current loop bounds into the allocated space. A canonical loop
4117 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4118 // and produces an inclusive upper bound.
4119 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4120 Constant *Zero = ConstantInt::get(IVTy, 0);
4121 Constant *One = ConstantInt::get(IVTy, 1);
4122 Builder.CreateStore(Zero, PLowerBound);
4123 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4124 Builder.CreateStore(UpperBound, PUpperBound);
4125 Builder.CreateStore(One, PStride);
4126
4127 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4128
4129 Constant *SchedulingType = ConstantInt::get(
4130 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4131
4132 // Call the "init" function and update the trip count of the loop with the
4133 // value it produced.
4134 Builder.CreateCall(StaticInit,
4135 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4136 PUpperBound, PStride, One, Zero});
4137 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4138 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4139 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4140 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4141 CLI->setTripCount(TripCount);
4142
4143 // Update all uses of the induction variable except the one in the condition
4144 // block that compares it with the actual upper bound, and the increment in
4145 // the latch block.
4146
4147 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4148 Builder.SetInsertPoint(CLI->getBody(),
4149 CLI->getBody()->getFirstInsertionPt());
4150 Builder.SetCurrentDebugLocation(DL);
4151 return Builder.CreateAdd(OldIV, LowerBound);
4152 });
4153
4154 // In the "exit" block, call the "fini" function.
4155 Builder.SetInsertPoint(CLI->getExit(),
4156 CLI->getExit()->getTerminator()->getIterator());
4157 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4158
4159 // Add the barrier if requested.
4160 if (NeedsBarrier) {
4161 InsertPointOrErrorTy BarrierIP =
4162 createBarrier(LocationDescription(Builder.saveIP(), DL),
4163 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4164 /* CheckCancelFlag */ false);
4165 if (!BarrierIP)
4166 return BarrierIP.takeError();
4167 }
4168
4169 InsertPointTy AfterIP = CLI->getAfterIP();
4170 CLI->invalidate();
4171
4172 return AfterIP;
4173}
4174
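// The chunked variant below nests the original loop inside a "dispatch" loop
// that enumerates chunks; conceptually (pseudo-C, names illustrative):
//   __kmpc_for_static_init(..., &lb, &ub, &stride, /*incr=*/1, chunk);
//   for (d = lb; d < tripcount; d += stride)                // dispatch loop
//     for (iv = 0; iv < min(chunk_range, tripcount - d); ++iv)
//       <body>(d + iv);                                     // chunk loop
//   __kmpc_for_static_fini(...);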
4175 OpenMPIRBuilder::InsertPointOrErrorTy
4176 OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4177 CanonicalLoopInfo *CLI,
4178 InsertPointTy AllocaIP,
4179 bool NeedsBarrier,
4180 Value *ChunkSize) {
4181 assert(CLI->isValid() && "Requires a valid canonical loop");
4182 assert(ChunkSize && "Chunk size is required");
4183
4184 LLVMContext &Ctx = CLI->getFunction()->getContext();
4185 Value *IV = CLI->getIndVar();
4186 Value *OrigTripCount = CLI->getTripCount();
4187 Type *IVTy = IV->getType();
4188 assert(IVTy->getIntegerBitWidth() <= 64 &&
4189 "Max supported tripcount bitwidth is 64 bits");
4190 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4191 : Type::getInt64Ty(Ctx);
4192 Type *I32Type = Type::getInt32Ty(M.getContext());
4193 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4194 Constant *One = ConstantInt::get(InternalIVTy, 1);
4195
4196 // Declare useful OpenMP runtime functions.
4197 FunctionCallee StaticInit =
4198 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4199 FunctionCallee StaticFini =
4200 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4201
4202 // Allocate space for computed loop bounds as expected by the "init" function.
4203 Builder.restoreIP(AllocaIP);
4204 Builder.SetCurrentDebugLocation(DL);
4205 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4206 Value *PLowerBound =
4207 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4208 Value *PUpperBound =
4209 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4210 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4211
4212 // Set up the source location value for the OpenMP runtime.
4213 Builder.restoreIP(CLI->getPreheaderIP());
4214 Builder.SetCurrentDebugLocation(DL);
4215
4216 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4217 Value *CastedChunkSize =
4218 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4219 Value *CastedTripCount =
4220 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4221
4222 Constant *SchedulingType = ConstantInt::get(
4223 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4224 Builder.CreateStore(Zero, PLowerBound);
4225 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4226 Builder.CreateStore(OrigUpperBound, PUpperBound);
4227 Builder.CreateStore(One, PStride);
4228
4229 // Call the "init" function and update the trip count of the loop with the
4230 // value it produced.
4231 uint32_t SrcLocStrSize;
4232 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4233 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4234 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4235 Builder.CreateCall(StaticInit,
4236 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4237 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4238 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4239 /*pstride=*/PStride, /*incr=*/One,
4240 /*chunk=*/CastedChunkSize});
4241
4242 // Load values written by the "init" function.
4243 Value *FirstChunkStart =
4244 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4245 Value *FirstChunkStop =
4246 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4247 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4248 Value *ChunkRange =
4249 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4250 Value *NextChunkStride =
4251 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4252
4253 // Create outer "dispatch" loop for enumerating the chunks.
4254 BasicBlock *DispatchEnter = splitBB(Builder, true);
4255 Value *DispatchCounter;
4256
4257 // It is safe to assume this didn't return an error because the callback
4258 // passed into createCanonicalLoop is the only possible error source, and it
4259 // always returns success.
4260 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4261 {Builder.saveIP(), DL},
4262 [&](InsertPointTy BodyIP, Value *Counter) {
4263 DispatchCounter = Counter;
4264 return Error::success();
4265 },
4266 FirstChunkStart, CastedTripCount, NextChunkStride,
4267 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4268 "dispatch"));
4269
4270 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4271 // not have to preserve the canonical invariant.
4272 BasicBlock *DispatchBody = DispatchCLI->getBody();
4273 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4274 BasicBlock *DispatchExit = DispatchCLI->getExit();
4275 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4276 DispatchCLI->invalidate();
4277
4278 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4279 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4280 redirectTo(CLI->getExit(), DispatchLatch, DL);
4281 redirectTo(DispatchBody, DispatchEnter, DL);
4282
4283 // Prepare the prolog of the chunk loop.
4284 Builder.restoreIP(CLI->getPreheaderIP());
4285 Builder.SetCurrentDebugLocation(DL);
4286
4287 // Compute the number of iterations of the chunk loop.
4288 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4289 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4290 Value *IsLastChunk =
4291 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4292 Value *CountUntilOrigTripCount =
4293 Builder.CreateSub(CastedTripCount, DispatchCounter);
4294 Value *ChunkTripCount = Builder.CreateSelect(
4295 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4296 Value *BackcastedChunkTC =
4297 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4298 CLI->setTripCount(BackcastedChunkTC);
4299
4300 // Update all uses of the induction variable except the one in the condition
4301 // block that compares it with the actual upper bound, and the increment in
4302 // the latch block.
4303 Value *BackcastedDispatchCounter =
4304 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4305 CLI->mapIndVar([&](Instruction *) -> Value * {
4306 Builder.restoreIP(CLI->getBodyIP());
4307 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4308 });
4309
4310 // In the "exit" block, call the "fini" function.
4311 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4312 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4313
4314 // Add the barrier if requested.
4315 if (NeedsBarrier) {
4316 InsertPointOrErrorTy AfterIP =
4317 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4318 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4319 if (!AfterIP)
4320 return AfterIP.takeError();
4321 }
4322
4323#ifndef NDEBUG
4324 // Even though we currently do not support applying additional methods to it,
4325 // the chunk loop should remain a canonical loop.
4326 CLI->assertOK();
4327#endif
4328
4329 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4330}
4331
4332// Returns an LLVM function to call for executing an OpenMP static worksharing
4333// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4334// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4335static FunctionCallee
4336 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4337 WorksharingLoopType LoopType) {
4338 unsigned Bitwidth = Ty->getIntegerBitWidth();
4339 Module &M = OMPBuilder->M;
4340 switch (LoopType) {
4341 case WorksharingLoopType::ForStaticLoop:
4342 if (Bitwidth == 32)
4343 return OMPBuilder->getOrCreateRuntimeFunction(
4344 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4345 if (Bitwidth == 64)
4346 return OMPBuilder->getOrCreateRuntimeFunction(
4347 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4348 break;
4349 case WorksharingLoopType::DistributeStaticLoop:
4350 if (Bitwidth == 32)
4351 return OMPBuilder->getOrCreateRuntimeFunction(
4352 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4353 if (Bitwidth == 64)
4354 return OMPBuilder->getOrCreateRuntimeFunction(
4355 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4356 break;
4357 case WorksharingLoopType::DistributeForStaticLoop:
4358 if (Bitwidth == 32)
4359 return OMPBuilder->getOrCreateRuntimeFunction(
4360 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4361 if (Bitwidth == 64)
4362 return OMPBuilder->getOrCreateRuntimeFunction(
4363 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4364 break;
4365 }
4366 if (Bitwidth != 32 && Bitwidth != 64) {
4367 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4368 }
4369 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4370}
4371
4372// Inserts a call to proper OpenMP Device RTL function which handles
4373// loop worksharing.
4374 static void createTargetLoopWorkshareCall(
4375 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4376 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4377 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4378 Type *TripCountTy = TripCount->getType();
4379 Module &M = OMPBuilder->M;
4380 IRBuilder<> &Builder = OMPBuilder->Builder;
4381 FunctionCallee RTLFn =
4382 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4383 SmallVector<Value *, 8> RealArgs;
4384 RealArgs.push_back(Ident);
4385 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4386 RealArgs.push_back(LoopBodyArg);
4387 RealArgs.push_back(TripCount);
4388 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4389 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4390 Builder.CreateCall(RTLFn, RealArgs);
4391 return;
4392 }
4393 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4394 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4395 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4396 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4397
4398 RealArgs.push_back(
4399 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4400 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4401 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4402 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4403 }
4404
4405 Builder.CreateCall(RTLFn, RealArgs);
4406}
4407
4408static void
4409 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4410 CanonicalLoopInfo *CLI, Value *Ident,
4411 Function &OutlinedFn, Type *ParallelTaskPtr,
4412 const SmallVector<Instruction *, 4> &ToBeDeleted,
4413 WorksharingLoopType LoopType) {
4414 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4415 BasicBlock *Preheader = CLI->getPreheader();
4416 Value *TripCount = CLI->getTripCount();
4417
4418 // After loop body outlining, the loop body contains only the set-up of the
4419 // loop body argument structure and the call to the outlined loop body
4420 // function. First, we need to move the setup of the loop body args into the
4421 // loop preheader.
4422 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4423 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4424
4425 // The next step is to remove the whole loop. We do not need it anymore.
4426 // That's why we make an unconditional branch from the loop preheader to the
4427 // loop exit block.
4428 Builder.restoreIP({Preheader, Preheader->end()});
4429 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4430 Preheader->getTerminator()->eraseFromParent();
4431 Builder.CreateBr(CLI->getExit());
4432
4433 // Delete dead loop blocks
4434 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4435 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4436 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4437 CleanUpInfo.EntryBB = CLI->getHeader();
4438 CleanUpInfo.ExitBB = CLI->getExit();
4439 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4440 DeleteDeadBlocks(BlocksToBeRemoved);
4441
4442 // Find the instruction which corresponds to the loop body argument structure
4443 // and remove the call to the loop body function.
4444 Value *LoopBodyArg;
4445 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4446 assert(OutlinedFnUser &&
4447 "Expected unique undroppable user of outlined function");
4448 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4449 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4450 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4451 "Expected outlined function call to be located in loop preheader");
4452 // Check in case no argument structure has been passed.
4453 if (OutlinedFnCallInstruction->arg_size() > 1)
4454 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4455 else
4456 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4457 OutlinedFnCallInstruction->eraseFromParent();
4458
4459 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4460 LoopBodyArg, ParallelTaskPtr, TripCount,
4461 OutlinedFn);
4462
4463 for (auto &ToBeDeletedItem : ToBeDeleted)
4464 ToBeDeletedItem->eraseFromParent();
4465 CLI->invalidate();
4466}
4467
4468 OpenMPIRBuilder::InsertPointTy
4469 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4470 InsertPointTy AllocaIP,
4471 WorksharingLoopType LoopType) {
4472 uint32_t SrcLocStrSize;
4473 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4474 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4475
4476 OutlineInfo OI;
4477 OI.OuterAllocaBB = CLI->getPreheader();
4478 Function *OuterFn = CLI->getPreheader()->getParent();
4479
4480 // Instructions which need to be deleted at the end of code generation
4481 SmallVector<Instruction *, 4> ToBeDeleted;
4482
4483 OI.OuterAllocaBB = AllocaIP.getBlock();
4484
4485 // Mark the body loop as region which needs to be extracted
4486 OI.EntryBB = CLI->getBody();
4487 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4488 "omp.prelatch", true);
4489
4490 // Prepare loop body for extraction
4491 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4492
4493 // Insert new loop counter variable which will be used only in loop
4494 // body.
4495 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4496 Instruction *NewLoopCntLoad =
4497 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4498 // New loop counter instructions are redundant in the loop preheader when
4499 // code generation for the workshare loop is finished. That's why we mark
4500 // them as ready for deletion.
4501 ToBeDeleted.push_back(NewLoopCntLoad);
4502 ToBeDeleted.push_back(NewLoopCnt);
4503
4504 // Analyse loop body region. Find all input variables which are used inside
4505 // loop body region.
4506 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4507 SmallVector<BasicBlock *, 32> Blocks;
4508 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4509 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4510 ParallelRegionBlockSet.end());
4511
4512 CodeExtractorAnalysisCache CEAC(*OuterFn);
4513 CodeExtractor Extractor(Blocks,
4514 /* DominatorTree */ nullptr,
4515 /* AggregateArgs */ true,
4516 /* BlockFrequencyInfo */ nullptr,
4517 /* BranchProbabilityInfo */ nullptr,
4518 /* AssumptionCache */ nullptr,
4519 /* AllowVarArgs */ true,
4520 /* AllowAlloca */ true,
4521 /* AllocationBlock */ CLI->getPreheader(),
4522 /* Suffix */ ".omp_wsloop",
4523 /* AggrArgsIn0AddrSpace */ true);
4524
4525 BasicBlock *CommonExit = nullptr;
4526 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4527
4528 // Find allocas outside the loop body region which are used inside loop
4529 // body
4530 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4531
4532 // We need to model the loop body region as the function f(cnt, loop_arg).
4533 // That's why we replace the loop induction variable with the new counter,
4534 // which will be one of the loop body function arguments.
4535 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4536 CLI->getIndVar()->user_end());
4537 for (auto Use : Users) {
4538 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4539 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4540 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4541 }
4542 }
4543 }
4544 // Make sure that the loop counter variable is not merged into the loop body
4545 // function argument structure and that it is passed as a separate variable.
4546 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4547
4548 // The PostOutline CB is invoked when the loop body function is outlined and
4549 // the loop body is replaced by a call to the outlined function. We need to
4550 // add a call to the OpenMP device RTL inside the loop preheader; the OpenMP
4551 // device RTL function will handle the loop control logic.
4552 //
4553 OI.PostOutlineCB = [=, ToBeDeletedVec =
4554 std::move(ToBeDeleted)](Function &OutlinedFn) {
4555 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4556 ToBeDeletedVec, LoopType);
4557 };
4558 addOutlineInfo(std::move(OI));
4559 return CLI->getAfterIP();
4560}
4561
4562 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4563 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4564 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4565 bool HasSimdModifier, bool HasMonotonicModifier,
4566 bool HasNonmonotonicModifier, bool HasOrderedClause,
4567 WorksharingLoopType LoopType) {
4568 if (Config.isTargetDevice())
4569 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4570 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4571 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4572 HasNonmonotonicModifier, HasOrderedClause);
4573
4574 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4575 OMPScheduleType::ModifierOrdered;
4576 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4577 case OMPScheduleType::BaseStatic:
4578 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4579 if (IsOrdered)
4580 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4581 NeedsBarrier, ChunkSize);
4582 // FIXME: Monotonicity ignored?
4583 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4584
4585 case OMPScheduleType::BaseStaticChunked:
4586 if (IsOrdered)
4587 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4588 NeedsBarrier, ChunkSize);
4589 // FIXME: Monotonicity ignored?
4590 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4591 ChunkSize);
4592
4593 case OMPScheduleType::BaseRuntime:
4594 case OMPScheduleType::BaseAuto:
4595 case OMPScheduleType::BaseGreedy:
4596 case OMPScheduleType::BaseBalanced:
4597 case OMPScheduleType::BaseSteal:
4598 case OMPScheduleType::BaseGuidedSimd:
4599 case OMPScheduleType::BaseRuntimeSimd:
4600 assert(!ChunkSize &&
4601 "schedule type does not support user-defined chunk sizes");
4602 [[fallthrough]];
4603 case OMPScheduleType::BaseDynamicChunked:
4604 case OMPScheduleType::BaseGuidedChunked:
4605 case OMPScheduleType::BaseGuidedIterativeChunked:
4606 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4607 case OMPScheduleType::BaseStaticBalancedChunked:
4608 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4609 NeedsBarrier, ChunkSize);
4610
4611 default:
4612 llvm_unreachable("Unknown/unimplemented schedule kind");
4613 }
4614}
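// Editorial note (not part of the original source): a worked example of the
// dispatch above, assuming an existing OpenMPIRBuilder OMPB, a canonical loop
// CLI, and host compilation; enum spellings are from OMPConstants and may vary
// by LLVM version. For `#pragma omp for schedule(dynamic, 4)` the effective
// schedule becomes a nonmonotonic dynamic-chunked type, so the call below ends
// up in applyDynamicWorkshareLoop:
//
//   Value *Chunk = ConstantInt::get(CLI->getIndVarType(), 4);
//   auto AfterIP = OMPB.applyWorkshareLoop(
//       DL, CLI, AllocaIP, /*NeedsBarrier=*/true,
//       omp::ScheduleKind::OMP_SCHEDULE_Dynamic, Chunk,
//       /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
//       /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/false);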
4615
4616/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4617/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4618/// the runtime. Always interpret integers as unsigned similarly to
4619/// CanonicalLoopInfo.
4620 static FunctionCallee
4621 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4622 unsigned Bitwidth = Ty->getIntegerBitWidth();
4623 if (Bitwidth == 32)
4624 return OMPBuilder.getOrCreateRuntimeFunction(
4625 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4626 if (Bitwidth == 64)
4627 return OMPBuilder.getOrCreateRuntimeFunction(
4628 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4629 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4630}
4631
4632/// Returns an LLVM function to call for updating the next loop using OpenMP
4633/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4634/// the runtime. Always interpret integers as unsigned similarly to
4635/// CanonicalLoopInfo.
4636 static FunctionCallee
4637 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4638 unsigned Bitwidth = Ty->getIntegerBitWidth();
4639 if (Bitwidth == 32)
4640 return OMPBuilder.getOrCreateRuntimeFunction(
4641 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4642 if (Bitwidth == 64)
4643 return OMPBuilder.getOrCreateRuntimeFunction(
4644 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4645 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4646}
4647
4648 /// Returns an LLVM function to call for finalizing the dynamic loop,
4649 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4650 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
4651 static FunctionCallee
4652 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4653 unsigned Bitwidth = Ty->getIntegerBitWidth();
4654 if (Bitwidth == 32)
4655 return OMPBuilder.getOrCreateRuntimeFunction(
4656 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4657 if (Bitwidth == 64)
4658 return OMPBuilder.getOrCreateRuntimeFunction(
4659 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4660 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4661}
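// Editorial note (not part of the original source): the three helpers above
// perform the same bitwidth dispatch, e.g. an i32 induction variable selects
// __kmpc_dispatch_init_4u/__kmpc_dispatch_next_4u/__kmpc_dispatch_fini_4u and
// an i64 one the _8u variants. Sketch of the runtime-side signatures as used
// here (unsigned 32-bit flavor; the 64-bit flavor widens lb/ub/stride):
//
//   void __kmpc_dispatch_init_4u(ident_t *loc, int32_t gtid, int32_t sched,
//                                uint32_t lb, uint32_t ub, int32_t st,
//                                int32_t chunk);
//   int  __kmpc_dispatch_next_4u(ident_t *loc, int32_t gtid, int32_t *p_last,
//                                uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st);
//   void __kmpc_dispatch_fini_4u(ident_t *loc, int32_t gtid);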
4662
4663 OpenMPIRBuilder::InsertPointOrErrorTy
4664 OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4665 InsertPointTy AllocaIP,
4666 OMPScheduleType SchedType,
4667 bool NeedsBarrier, Value *Chunk) {
4668 assert(CLI->isValid() && "Requires a valid canonical loop");
4669 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4670 "Require dedicated allocate IP");
4672 "Require valid schedule type");
4673
4674 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4675 OMPScheduleType::ModifierOrdered;
4676
4677 // Set up the source location value for the OpenMP runtime.
4678 Builder.SetCurrentDebugLocation(DL);
4679
4680 uint32_t SrcLocStrSize;
4681 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4682 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4683
4684 // Declare useful OpenMP runtime functions.
4685 Value *IV = CLI->getIndVar();
4686 Type *IVTy = IV->getType();
4687 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4688 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4689
4690 // Allocate space for computed loop bounds as expected by the "init" function.
4691 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4692 Type *I32Type = Type::getInt32Ty(M.getContext());
4693 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4694 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4695 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4696 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4697
4698 // At the end of the preheader, prepare for calling the "init" function by
4699 // storing the current loop bounds into the allocated space. A canonical loop
4700 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4701 // and produces an inclusive upper bound.
4702 BasicBlock *PreHeader = CLI->getPreheader();
4703 Builder.SetInsertPoint(PreHeader->getTerminator());
4704 Constant *One = ConstantInt::get(IVTy, 1);
4705 Builder.CreateStore(One, PLowerBound);
4706 Value *UpperBound = CLI->getTripCount();
4707 Builder.CreateStore(UpperBound, PUpperBound);
4708 Builder.CreateStore(One, PStride);
4709
4710 BasicBlock *Header = CLI->getHeader();
4711 BasicBlock *Exit = CLI->getExit();
4712 BasicBlock *Cond = CLI->getCond();
4713 BasicBlock *Latch = CLI->getLatch();
4714 InsertPointTy AfterIP = CLI->getAfterIP();
4715
4716 // The CLI will be "broken" in the code below, as the loop is no longer
4717 // a valid canonical loop.
4718
4719 if (!Chunk)
4720 Chunk = One;
4721
4722 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4723
4724 Constant *SchedulingType =
4725 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4726
4727 // Call the "init" function.
4728 Builder.CreateCall(DynamicInit,
4729 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4730 UpperBound, /* step */ One, Chunk});
4731
4732 // An outer loop around the existing one.
4733 BasicBlock *OuterCond = BasicBlock::Create(
4734 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4735 PreHeader->getParent());
4736 // The "next" call always returns 32-bit values, so the IV-typed constants above cannot be reused here.
4737 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4738 Value *Res =
4739 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4740 PLowerBound, PUpperBound, PStride});
4741 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4742 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4743 Value *LowerBound =
4744 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4745 Builder.CreateCondBr(MoreWork, Header, Exit);
4746
4747 // Change PHI-node in loop header to use outer cond rather than preheader,
4748 // and set IV to the LowerBound.
4749 Instruction *Phi = &Header->front();
4750 auto *PI = cast<PHINode>(Phi);
4751 PI->setIncomingBlock(0, OuterCond);
4752 PI->setIncomingValue(0, LowerBound);
4753
4754 // Then set the pre-header to jump to the OuterCond
4755 Instruction *Term = PreHeader->getTerminator();
4756 auto *Br = cast<BranchInst>(Term);
4757 Br->setSuccessor(0, OuterCond);
4758
4759 // Modify the inner condition:
4760 // * Use the UpperBound returned from the DynamicNext call.
4761 // * Jump to the outer loop when done with one of the inner loops.
4762 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4763 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4764 Instruction *Comp = &*Builder.GetInsertPoint();
4765 auto *CI = cast<CmpInst>(Comp);
4766 CI->setOperand(1, UpperBound);
4767 // Redirect the inner exit to branch to outer condition.
4768 Instruction *Branch = &Cond->back();
4769 auto *BI = cast<BranchInst>(Branch);
4770 assert(BI->getSuccessor(1) == Exit);
4771 BI->setSuccessor(1, OuterCond);
4772
4773 // Call the "fini" function if "ordered" is present in wsloop directive.
4774 if (Ordered) {
4775 Builder.SetInsertPoint(&Latch->back());
4776 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4777 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4778 }
4779
4780 // Add the barrier if requested.
4781 if (NeedsBarrier) {
4782 Builder.SetInsertPoint(&Exit->back());
4783 InsertPointOrErrorTy BarrierIP =
4784 createBarrier(LocationDescription(Builder.saveIP(), DL),
4785 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4786 /* CheckCancelFlag */ false);
4787 if (!BarrierIP)
4788 return BarrierIP.takeError();
4789 }
4790
4791 CLI->invalidate();
4792 return AfterIP;
4793}
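// Editorial summary (not part of the original source): the control flow built
// by applyDynamicWorkshareLoop is, schematically:
//
//   preheader:
//     store 1 -> p.lowerbound, tripcount -> p.upperbound, 1 -> p.stride
//     call __kmpc_dispatch_init_*(loc, tid, schedtype, 1, tripcount, 1, chunk)
//     br label %outer.cond
//   outer.cond:
//     %more = call __kmpc_dispatch_next_*(loc, tid, p.lastiter,
//                                         p.lowerbound, p.upperbound, p.stride)
//     %lb   = load(p.lowerbound) - 1        ; rebase chunk to the 0-based IV
//     br %more != 0, label %header, label %exit
//   header..latch:                          ; the original loop, IV starts at %lb;
//     the inner condition now compares against the dispatched upper bound and
//     branches back to %outer.cond when the current chunk is exhausted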
4794
4795/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4796/// after this \p OldTarget will be orphaned.
4797 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4798 BasicBlock *NewTarget, DebugLoc DL) {
4799 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4800 redirectTo(Pred, NewTarget, DL);
4801}
4802
4803 /// Determine which blocks in \p BBs are still reachable from outside and
4804 /// erase from the function the ones that are not.
4805 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4806 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4807 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4808 for (Use &U : BB->uses()) {
4809 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4810 if (!UseInst)
4811 continue;
4812 if (BBsToErase.count(UseInst->getParent()))
4813 continue;
4814 return true;
4815 }
4816 return false;
4817 };
4818
4819 while (BBsToErase.remove_if(HasRemainingUses)) {
4820 // Try again if anything was removed.
4821 }
4822
4823 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4824 DeleteDeadBlocks(BBVec);
4825}
4826
4827 CanonicalLoopInfo *
4828 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4829 InsertPointTy ComputeIP) {
4830 assert(Loops.size() >= 1 && "At least one loop required");
4831 size_t NumLoops = Loops.size();
4832
4833 // Nothing to do if there is already just one loop.
4834 if (NumLoops == 1)
4835 return Loops.front();
4836
4837 CanonicalLoopInfo *Outermost = Loops.front();
4838 CanonicalLoopInfo *Innermost = Loops.back();
4839 BasicBlock *OrigPreheader = Outermost->getPreheader();
4840 BasicBlock *OrigAfter = Outermost->getAfter();
4841 Function *F = OrigPreheader->getParent();
4842
4843 // Loop control blocks that may become orphaned later.
4844 SmallVector<BasicBlock *, 12> OldControlBBs;
4845 OldControlBBs.reserve(6 * Loops.size());
4846 for (CanonicalLoopInfo *Loop : Loops)
4847 Loop->collectControlBlocks(OldControlBBs);
4848
4849 // Setup the IRBuilder for inserting the trip count computation.
4850 Builder.SetCurrentDebugLocation(DL);
4851 if (ComputeIP.isSet())
4852 Builder.restoreIP(ComputeIP);
4853 else
4854 Builder.restoreIP(Outermost->getPreheaderIP());
4855
4856 // Derive the collapsed loop's trip count.
4857 // TODO: Find common/largest indvar type.
4858 Value *CollapsedTripCount = nullptr;
4859 for (CanonicalLoopInfo *L : Loops) {
4860 assert(L->isValid() &&
4861 "All loops to collapse must be valid canonical loops");
4862 Value *OrigTripCount = L->getTripCount();
4863 if (!CollapsedTripCount) {
4864 CollapsedTripCount = OrigTripCount;
4865 continue;
4866 }
4867
4868 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4869 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4870 {}, /*HasNUW=*/true);
4871 }
4872
4873 // Create the collapsed loop control flow.
4874 CanonicalLoopInfo *Result =
4875 createLoopSkeleton(DL, CollapsedTripCount, F,
4876 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4877
4878 // Build the collapsed loop body code.
4879 // Start with deriving the input loop induction variables from the collapsed
4880 // one, using a divmod scheme. To preserve the original loops' order, the
4881 // innermost loop uses the least significant bits.
4882 Builder.restoreIP(Result->getBodyIP());
4883
4884 Value *Leftover = Result->getIndVar();
4885 SmallVector<Value *> NewIndVars;
4886 NewIndVars.resize(NumLoops);
4887 for (int i = NumLoops - 1; i >= 1; --i) {
4888 Value *OrigTripCount = Loops[i]->getTripCount();
4889
4890 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4891 NewIndVars[i] = NewIndVar;
4892
4893 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4894 }
4895 // Outermost loop gets all the remaining bits.
4896 NewIndVars[0] = Leftover;
4897
4898 // Construct the loop body control flow.
4899 // We progressively construct the branch structure following the direction of
4900 // the control flow: the leading in-between code, the loop nest body, the
4901 // trailing in-between code, and finally the rejoin at the collapsed loop's
4902 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
4903 // next edge. If ContinueBlock is set, continue with that block. If
4904 // ContinuePred is set, use its predecessors as sources.
4905 BasicBlock *ContinueBlock = Result->getBody();
4906 BasicBlock *ContinuePred = nullptr;
4907 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4908 BasicBlock *NextSrc) {
4909 if (ContinueBlock)
4910 redirectTo(ContinueBlock, Dest, DL);
4911 else
4912 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4913
4914 ContinueBlock = nullptr;
4915 ContinuePred = NextSrc;
4916 };
4917
4918 // The code before the nested loop of each level.
4919 // Because we are sinking it into the nest, it will be executed more often
4920 // than in the original loop. More sophisticated schemes could keep track of
4921 // what the in-between code is and instantiate it only once per thread.
4922 for (size_t i = 0; i < NumLoops - 1; ++i)
4923 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4924
4925 // Connect the loop nest body.
4926 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4927
4928 // The code after the nested loop at each level.
4929 for (size_t i = NumLoops - 1; i > 0; --i)
4930 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4931
4932 // Connect the finished loop to the collapsed loop latch.
4933 ContinueWith(Result->getLatch(), nullptr);
4934
4935 // Replace the input loops with the new collapsed loop.
4936 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4937 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4938
4939 // Replace the input loop indvars with the derived ones.
4940 for (size_t i = 0; i < NumLoops; ++i)
4941 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4942
4943 // Remove unused parts of the input loops.
4944 removeUnusedBlocksFromParent(OldControlBBs);
4945
4946 for (CanonicalLoopInfo *L : Loops)
4947 L->invalidate();
4948
4949#ifndef NDEBUG
4950 Result->assertOK();
4951#endif
4952 return Result;
4953}
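// Editorial example (not part of the original source): a standalone C++ model
// of the divmod de-linearization scheme used above, for a 3-deep nest with
// trip counts N0..N2. Extracted, it compiles and runs as-is.
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     uint64_t N0 = 4, N1 = 3, N2 = 5;
//     for (uint64_t Collapsed = 0; Collapsed < N0 * N1 * N2; ++Collapsed) {
//       uint64_t Leftover = Collapsed;
//       uint64_t I2 = Leftover % N2; Leftover /= N2; // innermost: least significant
//       uint64_t I1 = Leftover % N1; Leftover /= N1;
//       uint64_t I0 = Leftover;                      // outermost gets the rest
//       assert(I0 < N0 && I1 < N1 && I2 < N2);
//     }
//   }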
4954
4955 std::vector<CanonicalLoopInfo *>
4956 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4957 ArrayRef<Value *> TileSizes) {
4958 assert(TileSizes.size() == Loops.size() &&
4959 "Must pass as many tile sizes as there are loops");
4960 int NumLoops = Loops.size();
4961 assert(NumLoops >= 1 && "At least one loop to tile required");
4962
4963 CanonicalLoopInfo *OutermostLoop = Loops.front();
4964 CanonicalLoopInfo *InnermostLoop = Loops.back();
4965 Function *F = OutermostLoop->getBody()->getParent();
4966 BasicBlock *InnerEnter = InnermostLoop->getBody();
4967 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4968
4969 // Loop control blocks that may become orphaned later.
4970 SmallVector<BasicBlock *, 12> OldControlBBs;
4971 OldControlBBs.reserve(6 * Loops.size());
4972 for (CanonicalLoopInfo *Loop : Loops)
4973 Loop->collectControlBlocks(OldControlBBs);
4974
4975 // Collect original trip counts and induction variables to be accessible by
4976 // index. Also, the structure of the original loops is not preserved during
4977 // the construction of the tiled loops, so do it before we scavenge the BBs of
4978 // any original CanonicalLoopInfo.
4979 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4980 for (CanonicalLoopInfo *L : Loops) {
4981 assert(L->isValid() && "All input loops must be valid canonical loops");
4982 OrigTripCounts.push_back(L->getTripCount());
4983 OrigIndVars.push_back(L->getIndVar());
4984 }
4985
4986 // Collect the code between loop headers. These may contain SSA definitions
4987 // that are used in the loop nest body. To be usable within the innermost
4988 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4989 // these instructions may be executed more often than before the tiling.
4990 // TODO: It would be sufficient to only sink them into body of the
4991 // corresponding tile loop.
4992 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
4993 for (int i = 0; i < NumLoops - 1; ++i) {
4994 CanonicalLoopInfo *Surrounding = Loops[i];
4995 CanonicalLoopInfo *Nested = Loops[i + 1];
4996
4997 BasicBlock *EnterBB = Surrounding->getBody();
4998 BasicBlock *ExitBB = Nested->getHeader();
4999 InbetweenCode.emplace_back(EnterBB, ExitBB);
5000 }
5001
5002 // Compute the trip counts of the floor loops.
5003 Builder.SetCurrentDebugLocation(DL);
5004 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5005 SmallVector<Value *, 4> FloorCount, FloorRems;
5006 for (int i = 0; i < NumLoops; ++i) {
5007 Value *TileSize = TileSizes[i];
5008 Value *OrigTripCount = OrigTripCounts[i];
5009 Type *IVType = OrigTripCount->getType();
5010
5011 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5012 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5013
5014 // 0 if tripcount divides the tilesize, 1 otherwise.
5015 // 1 means we need an additional iteration for a partial tile.
5016 //
5017 // Unfortunately we cannot just use the roundup-formula
5018 // (tripcount + tilesize - 1)/tilesize
5019 // because the summation might overflow. We do not want to introduce undefined
5020 // behavior when the untiled loop nest did not.
5021 Value *FloorTripOverflow =
5022 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5023
5024 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5025 FloorTripCount =
5026 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5027 "omp_floor" + Twine(i) + ".tripcount", true);
5028
5029 // Remember some values for later use.
5030 FloorCount.push_back(FloorTripCount);
5031 FloorRems.push_back(FloorTripRem);
5032 }
5033
5034 // Generate the new loop nest, from the outermost to the innermost.
5035 std::vector<CanonicalLoopInfo *> Result;
5036 Result.reserve(NumLoops * 2);
5037
5038 // The basic block of the surrounding loop that enters the generated loop
5039 // nest.
5040 BasicBlock *Enter = OutermostLoop->getPreheader();
5041
5042 // The basic block of the surrounding loop where the inner code should
5043 // continue.
5044 BasicBlock *Continue = OutermostLoop->getAfter();
5045
5046 // Where the next loop basic block should be inserted.
5047 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5048
5049 auto EmbeddNewLoop =
5050 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5051 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5052 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5053 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5054 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5055 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5056
5057 // Setup the position where the next embedded loop connects to this loop.
5058 Enter = EmbeddedLoop->getBody();
5059 Continue = EmbeddedLoop->getLatch();
5060 OutroInsertBefore = EmbeddedLoop->getLatch();
5061 return EmbeddedLoop;
5062 };
5063
5064 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5065 const Twine &NameBase) {
5066 for (auto P : enumerate(TripCounts)) {
5067 CanonicalLoopInfo *EmbeddedLoop =
5068 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5069 Result.push_back(EmbeddedLoop);
5070 }
5071 };
5072
5073 EmbeddNewLoops(FloorCount, "floor");
5074
5075 // Within the innermost floor loop, emit the code that computes the tile
5076 // sizes.
5077 Builder.restoreIP(Result.back()->getBodyIP());
5078 SmallVector<Value *, 4> TileCounts;
5079 for (int i = 0; i < NumLoops; ++i) {
5080 CanonicalLoopInfo *FloorLoop = Result[i];
5081 Value *TileSize = TileSizes[i];
5082
5083 Value *FloorIsEpilogue =
5084 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5085 Value *TileTripCount =
5086 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5087
5088 TileCounts.push_back(TileTripCount);
5089 }
5090
5091 // Create the tile loops.
5092 EmbeddNewLoops(TileCounts, "tile");
5093
5094 // Insert the inbetween code into the body.
5095 BasicBlock *BodyEnter = Enter;
5096 BasicBlock *BodyEntered = nullptr;
5097 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5098 BasicBlock *EnterBB = P.first;
5099 BasicBlock *ExitBB = P.second;
5100
5101 if (BodyEnter)
5102 redirectTo(BodyEnter, EnterBB, DL);
5103 else
5104 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5105
5106 BodyEnter = nullptr;
5107 BodyEntered = ExitBB;
5108 }
5109
5110 // Append the original loop nest body into the generated loop nest body.
5111 if (BodyEnter)
5112 redirectTo(BodyEnter, InnerEnter, DL);
5113 else
5114 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5115 redirectTo(InnerLatch, Continue, DL);
5116
5117 // Replace the original induction variable with an induction variable computed
5118 // from the tile and floor induction variables.
5119 Builder.restoreIP(Result.back()->getBodyIP());
5120 for (int i = 0; i < NumLoops; ++i) {
5121 CanonicalLoopInfo *FloorLoop = Result[i];
5122 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5123 Value *OrigIndVar = OrigIndVars[i];
5124 Value *Size = TileSizes[i];
5125
5126 Value *Scale =
5127 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5128 Value *Shift =
5129 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5130 OrigIndVar->replaceAllUsesWith(Shift);
5131 }
5132
5133 // Remove unused parts of the original loops.
5134 removeUnusedBlocksFromParent(OldControlBBs);
5135
5136 for (CanonicalLoopInfo *L : Loops)
5137 L->invalidate();
5138
5139#ifndef NDEBUG
5140 for (CanonicalLoopInfo *GenL : Result)
5141 GenL->assertOK();
5142#endif
5143 return Result;
5144}
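// Editorial example (not part of the original source): a standalone C++ model
// of the floor/tile decomposition above for one loop of N iterations tiled by
// T. The last floor iteration runs the partial tile of N % T iterations.
//
//   #include <cstdint>
//   int main() {
//     uint64_t N = 10, T = 4;
//     uint64_t FloorRem = N % T;                      // 2
//     uint64_t FloorCount = N / T + (FloorRem != 0);  // 3, overflow-safe round-up
//     for (uint64_t Floor = 0; Floor < FloorCount; ++Floor) {
//       bool IsEpilogue = FloorRem != 0 && Floor == FloorCount - 1;
//       uint64_t TileCount = IsEpilogue ? FloorRem : T; // 4, 4, then 2
//       for (uint64_t Tile = 0; Tile < TileCount; ++Tile) {
//         uint64_t OrigIndVar = Floor * T + Tile;     // the Scale + Shift above
//         (void)OrigIndVar;                           // visits 0..9 exactly once
//       }
//     }
//   }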
5145
5146/// Attach metadata \p Properties to the basic block described by \p BB. If the
5147 /// basic block already has metadata, the basic block properties are appended.
5148 static void addBasicBlockMetadata(BasicBlock *BB,
5149 ArrayRef<Metadata *> Properties) {
5150 // Nothing to do if no property to attach.
5151 if (Properties.empty())
5152 return;
5153
5154 LLVMContext &Ctx = BB->getContext();
5155 SmallVector<Metadata *> NewProperties;
5156 NewProperties.push_back(nullptr);
5157
5158 // If the basic block already has metadata, prepend it to the new metadata.
5159 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5160 if (Existing)
5161 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5162
5163 append_range(NewProperties, Properties);
5164 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5165 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5166
5167 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5168}
5169
5170/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5171/// loop already has metadata, the loop properties are appended.
5172 static void addLoopMetadata(CanonicalLoopInfo *Loop,
5173 ArrayRef<Metadata *> Properties) {
5174 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5175
5176 // Attach metadata to the loop's latch
5177 BasicBlock *Latch = Loop->getLatch();
5178 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5179 addBasicBlockMetadata(Latch, Properties);
5180}
5181
5182/// Attach llvm.access.group metadata to the memref instructions of \p Block
5183static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5184 LoopInfo &LI) {
5185 for (Instruction &I : *Block) {
5186 if (I.mayReadOrWriteMemory()) {
5187 // TODO: This instruction may already have access group from
5188 // other pragmas e.g. #pragma clang loop vectorize. Append
5189 // so that the existing metadata is not overwritten.
5190 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5191 }
5192 }
5193}
5194
5195 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5196 LLVMContext &Ctx = Builder.getContext();
5197 addLoopMetadata(
5198 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5199 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5200 }
5201
5202 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5203 LLVMContext &Ctx = Builder.getContext();
5204 addLoopMetadata(
5205 Loop, {
5206 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5207 });
5208 }
5209
5210void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5211 Value *IfCond, ValueToValueMapTy &VMap,
5212 const Twine &NamePrefix) {
5213 Function *F = CanonicalLoop->getFunction();
5214
5215 // Define where the if branch should be inserted.
5216 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5217
5218 // TODO: We should not rely on pass manager. Currently we use pass manager
5219 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5220 // object. We should have a method which returns all blocks between
5221 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5222 FunctionAnalysisManager FAM;
5223 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5224 FAM.registerPass([]() { return LoopAnalysis(); });
5225 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5226
5227 // Get the loop which needs to be cloned
5228 LoopAnalysis LIA;
5229 LoopInfo &&LI = LIA.run(*F, FAM);
5230 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5231
5232 // Create additional blocks for the if statement
5233 BasicBlock *Head = SplitBefore->getParent();
5234 Instruction *HeadOldTerm = Head->getTerminator();
5235 llvm::LLVMContext &C = Head->getContext();
5236 BasicBlock *ThenBlock = BasicBlock::Create(
5237 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5238 BasicBlock *ElseBlock = BasicBlock::Create(
5239 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5240
5241 // Create if condition branch.
5242 Builder.SetInsertPoint(HeadOldTerm);
5243 Instruction *BrInstr =
5244 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5245 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5246 // The then-block contains the branch to the OpenMP loop to be vectorized.
5247 spliceBB(IP, ThenBlock, false);
5248 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5249
5250 Builder.SetInsertPoint(ElseBlock);
5251
5252 // Clone the loop for the else branch.
5253 SmallVector<BasicBlock *, 8> NewBlocks;
5254
5255 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5256 for (BasicBlock *Block : L->getBlocks()) {
5257 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5258 NewBB->moveBefore(CanonicalLoop->getExit());
5259 VMap[Block] = NewBB;
5260 NewBlocks.push_back(NewBB);
5261 }
5262 remapInstructionsInBlocks(NewBlocks, VMap);
5263 Builder.CreateBr(NewBlocks.front());
5264}
5265
5266 unsigned
5267 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5268 const StringMap<bool> &Features) {
5269 if (TargetTriple.isX86()) {
5270 if (Features.lookup("avx512f"))
5271 return 512;
5272 else if (Features.lookup("avx"))
5273 return 256;
5274 return 128;
5275 }
5276 if (TargetTriple.isPPC())
5277 return 128;
5278 if (TargetTriple.isWasm())
5279 return 128;
5280 return 0;
5281}
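// Editorial note (not part of the original source): for an x86 target whose
// "target-features" include avx512f the query above returns 512 bits, with
// only avx it returns 256, otherwise 128; PPC and WebAssembly default to 128,
// and 0 means "no known default". A hypothetical caller (names illustrative):
//
//   StringMap<bool> Features = {{"avx", true}};
//   unsigned Bits = OMPB.getOpenMPDefaultSimdAlign(
//       Triple("x86_64-unknown-linux-gnu"), Features); // 256; /8 for bytes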
5282
5283 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5284 MapVector<Value *, Value *> AlignedVars,
5285 Value *IfCond, OrderKind Order,
5286 ConstantInt *Simdlen, ConstantInt *Safelen) {
5287 LLVMContext &Ctx = Builder.getContext();
5288
5289 Function *F = CanonicalLoop->getFunction();
5290
5291 // TODO: We should not rely on pass manager. Currently we use pass manager
5292 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5293 // object. We should have a method which returns all blocks between
5294 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5295 FunctionAnalysisManager FAM;
5296 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5297 FAM.registerPass([]() { return LoopAnalysis(); });
5298 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5299
5300 LoopAnalysis LIA;
5301 LoopInfo &&LI = LIA.run(*F, FAM);
5302
5303 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5304 if (AlignedVars.size()) {
5305 InsertPointTy IP = Builder.saveIP();
5306 for (auto &AlignedItem : AlignedVars) {
5307 Value *AlignedPtr = AlignedItem.first;
5308 Value *Alignment = AlignedItem.second;
5309 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5310 Builder.SetInsertPoint(loadInst->getNextNode());
5311 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5312 AlignedPtr, Alignment);
5313 }
5314 Builder.restoreIP(IP);
5315 }
5316
5317 if (IfCond) {
5318 ValueToValueMapTy VMap;
5319 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5320 // Add metadata to the cloned loop which disables vectorization
5321 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5322 assert(MappedLatch &&
5323 "Cannot find value which corresponds to original loop latch");
5324 assert(isa<BasicBlock>(MappedLatch) &&
5325 "Cannot cast mapped latch block value to BasicBlock");
5326 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5327 ConstantAsMetadata *BoolConst =
5328 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5329 addBasicBlockMetadata(
5330 NewLatchBlock,
5331 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5332 BoolConst})});
5333 }
5334
5335 SmallSet<BasicBlock *, 8> Reachable;
5336
5337 // Get the basic blocks from the loop in which memref instructions
5338 // can be found.
5339 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5340 // preferably without running any passes.
5341 for (BasicBlock *Block : L->getBlocks()) {
5342 if (Block == CanonicalLoop->getCond() ||
5343 Block == CanonicalLoop->getHeader())
5344 continue;
5345 Reachable.insert(Block);
5346 }
5347
5348 SmallVector<Metadata *> LoopMDList;
5349
5350 // In the presence of a finite 'safelen', it may be unsafe to mark all
5351 // the memory instructions parallel, because loop-carried
5352 // dependences of 'safelen' iterations are possible.
5353 // If clause order(concurrent) is specified then the memory instructions
5354 // are marked parallel even if 'safelen' is finite.
5355 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5356 // Add access group metadata to memory-access instructions.
5357 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5358 for (BasicBlock *BB : Reachable)
5359 addSimdMetadata(BB, AccessGroup, LI);
5360 // TODO: If the loop has existing parallel access metadata, have
5361 // to combine two lists.
5362 LoopMDList.push_back(MDNode::get(
5363 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5364 }
5365
5366 // Use the above access group metadata to create loop level
5367 // metadata, which should be distinct for each loop.
5368 ConstantAsMetadata *BoolConst =
5369 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5370 LoopMDList.push_back(MDNode::get(
5371 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5372
5373 if (Simdlen || Safelen) {
5374 // If both simdlen and safelen clauses are specified, the value of the
5375 // simdlen parameter must be less than or equal to the value of the safelen
5376 // parameter. Therefore, use safelen only in the absence of simdlen.
5377 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5378 LoopMDList.push_back(
5379 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5380 ConstantAsMetadata::get(VectorizeWidth)}));
5381 }
5382
5383 addLoopMetadata(CanonicalLoop, LoopMDList);
5384}
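// Editorial sketch (not part of the original source): with simdlen(8) and no
// finite safelen, applySimd leaves the latch terminator with loop metadata of
// roughly this shape:
//
//   br label %for.header, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}                                  ; the access group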
5385
5386/// Create the TargetMachine object to query the backend for optimization
5387/// preferences.
5388///
5389/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5390/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5391/// needed for the LLVM pass pipline. We use some default options to avoid
5392/// having to pass too many settings from the frontend that probably do not
5393/// matter.
5394///
5395/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5396/// method. If we are going to use TargetMachine for more purposes, especially
5397/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5398 /// might become worth requiring front-ends to pass on their TargetMachine,
5399 /// or at least cache it between methods. Note that while frontends such as
5400 /// Clang have just a single main TargetMachine per translation unit,
5401 /// "target-cpu" and "target-features" that determine the TargetMachine are
5402 /// per-function and can be overridden using __attribute__((target("OPTIONS"))).
5403static std::unique_ptr<TargetMachine>
5404 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5405 Module *M = F->getParent();
5406
5407 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5408 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5409 const std::string &Triple = M->getTargetTriple();
5410
5411 std::string Error;
5412 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5413 if (!TheTarget)
5414 return {};
5415
5416 llvm::TargetOptions Options;
5417 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5418 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5419 /*CodeModel=*/std::nullopt, OptLevel));
5420}
5421
5422/// Heuristically determine the best-performant unroll factor for \p CLI. This
5423/// depends on the target processor. We are re-using the same heuristics as the
5424/// LoopUnrollPass.
5425 static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5426 Function *F = CLI->getFunction();
5427
5428 // Assume the user requests the most aggressive unrolling, even if the rest of
5429 // the code is optimized using a lower setting.
5430 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5431 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5432
5434 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5435 FAM.registerPass([]() { return AssumptionAnalysis(); });
5436 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5437 FAM.registerPass([]() { return LoopAnalysis(); });
5438 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5439 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5440 TargetIRAnalysis TIRA;
5441 if (TM)
5442 TIRA = TargetIRAnalysis(
5443 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5444 FAM.registerPass([&]() { return TIRA; });
5445
5446 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5447 ScalarEvolutionAnalysis SEA;
5448 ScalarEvolution &&SE = SEA.run(*F, FAM);
5449 DominatorTreeAnalysis DTA;
5450 DominatorTree &&DT = DTA.run(*F, FAM);
5451 LoopAnalysis LIA;
5452 LoopInfo &&LI = LIA.run(*F, FAM);
5453 AssumptionAnalysis ACT;
5454 AssumptionCache &&AC = ACT.run(*F, FAM);
5455 OptimizationRemarkEmitter ORE{F};
5456
5457 Loop *L = LI.getLoopFor(CLI->getHeader());
5458 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5459
5460 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5461 L, SE, TTI,
5462 /*BlockFrequencyInfo=*/nullptr,
5463 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5464 /*UserThreshold=*/std::nullopt,
5465 /*UserCount=*/std::nullopt,
5466 /*UserAllowPartial=*/true,
5467 /*UserAllowRuntime=*/true,
5468 /*UserUpperBound=*/std::nullopt,
5469 /*UserFullUnrollMaxCount=*/std::nullopt);
5470
5471 UP.Force = true;
5472
5473 // Account for additional optimizations taking place before the LoopUnrollPass
5474 // would unroll the loop.
5475 UP.Threshold *= UnrollThresholdFactor;
5476 UP.PartialThreshold *= UnrollThresholdFactor;
5477
5478 // Use normal unroll factors even if the rest of the code is optimized for
5479 // size.
5480 UP.OptSizeThreshold = UP.Threshold;
5481 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5482
5483 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5484 << " Threshold=" << UP.Threshold << "\n"
5485 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5486 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5487 << " PartialOptSizeThreshold="
5488 << UP.PartialOptSizeThreshold << "\n");
5489
5490 // Disable peeling.
5491 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5492 L, SE, TTI,
5493 /*UserAllowPeeling=*/false,
5494 /*UserAllowProfileBasedPeeling=*/false,
5495 /*UnrollingSpecficValues=*/false);
5496
5498 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5499
5500 // Assume that reads and writes to stack variables can be eliminated by
5501 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5502 // size.
5503 for (BasicBlock *BB : L->blocks()) {
5504 for (Instruction &I : *BB) {
5505 Value *Ptr;
5506 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5507 Ptr = Load->getPointerOperand();
5508 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5509 Ptr = Store->getPointerOperand();
5510 } else
5511 continue;
5512
5513 Ptr = Ptr->stripPointerCasts();
5514
5515 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5516 if (Alloca->getParent() == &F->getEntryBlock())
5517 EphValues.insert(&I);
5518 }
5519 }
5520 }
5521
5522 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5523
5524 // Loop is not unrollable if the loop contains certain instructions.
5525 if (!UCE.canUnroll()) {
5526 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5527 return 1;
5528 }
5529
5530 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5531 << "\n");
5532
5533 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5534 // be able to use it.
5535 int TripCount = 0;
5536 int MaxTripCount = 0;
5537 bool MaxOrZero = false;
5538 unsigned TripMultiple = 0;
5539
5540 bool UseUpperBound = false;
5541 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5542 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5543 UseUpperBound);
5544 unsigned Factor = UP.Count;
5545 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5546
5547 // This function returns 1 to signal that the loop should not be unrolled.
5548 if (Factor == 0)
5549 return 1;
5550 return Factor;
5551}
5552
5553 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5554 int32_t Factor,
5555 CanonicalLoopInfo **UnrolledCLI) {
5556 assert(Factor >= 0 && "Unroll factor must not be negative");
5557
5558 Function *F = Loop->getFunction();
5559 LLVMContext &Ctx = F->getContext();
5560
5561 // If the unrolled loop is not used for another loop-associated directive, it
5562 // is sufficient to add metadata for the LoopUnrollPass.
5563 if (!UnrolledCLI) {
5564 SmallVector<Metadata *, 2> LoopMetadata;
5565 LoopMetadata.push_back(
5566 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5567
5568 if (Factor >= 1) {
5569 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5570 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5571 LoopMetadata.push_back(MDNode::get(
5572 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5573 }
5574
5575 addLoopMetadata(Loop, LoopMetadata);
5576 return;
5577 }
5578
5579 // Heuristically determine the unroll factor.
5580 if (Factor == 0)
5581 Factor = computeHeuristicUnrollFactor(Loop);
5582
5583 // No change required with unroll factor 1.
5584 if (Factor == 1) {
5585 *UnrolledCLI = Loop;
5586 return;
5587 }
5588
5589 assert(Factor >= 2 &&
5590 "unrolling only makes sense with a factor of 2 or larger");
5591
5592 Type *IndVarTy = Loop->getIndVarType();
5593
5594 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5595 // unroll the inner loop.
5596 Value *FactorVal =
5597 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5598 /*isSigned=*/false));
5599 std::vector<CanonicalLoopInfo *> LoopNest =
5600 tileLoops(DL, {Loop}, {FactorVal});
5601 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5602 *UnrolledCLI = LoopNest[0];
5603 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5604
5605 // LoopUnrollPass can only fully unroll loops with constant trip count.
5606 // Unroll by the unroll factor with a fallback epilog for the remainder
5607 // iterations if necessary.
5608 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5609 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5610 addLoopMetadata(
5611 InnerLoop,
5612 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5613 MDNode::get(
5614 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5615
5616#ifndef NDEBUG
5617 (*UnrolledCLI)->assertOK();
5618#endif
5619}
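// Editorial sketch (not part of the original source): after
// unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled), the generated inner
// (tile) loop carries metadata that LoopUnrollPass later flattens:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.count", i32 4}
//
// i.e. a partial unroll by 4 is expressed as "tile by 4, then fully unroll
// the tile loop", with the remainder handled by the floor loop's partial tile.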
5620
5621 OpenMPIRBuilder::InsertPointTy
5622 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5623 llvm::Value *BufSize, llvm::Value *CpyBuf,
5624 llvm::Value *CpyFn, llvm::Value *DidIt) {
5625 if (!updateToLocation(Loc))
5626 return Loc.IP;
5627
5628 uint32_t SrcLocStrSize;
5629 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5630 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5631 Value *ThreadId = getOrCreateThreadID(Ident);
5632
5633 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5634
5635 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5636
5637 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5638 Builder.CreateCall(Fn, Args);
5639
5640 return Builder.saveIP();
5641}
5642
5643 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
5644 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5645 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5646 ArrayRef<llvm::Function *> CPFuncs) {
5647
5648 if (!updateToLocation(Loc))
5649 return Loc.IP;
5650
5651 // If needed, allocate and initialize `DidIt` with 0.
5652 // DidIt: flag variable: 1=single thread; 0=not single thread.
5653 llvm::Value *DidIt = nullptr;
5654 if (!CPVars.empty()) {
5655 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5656 Builder.CreateStore(Builder.getInt32(0), DidIt);
5657 }
5658
5659 Directive OMPD = Directive::OMPD_single;
5660 uint32_t SrcLocStrSize;
5661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5662 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5663 Value *ThreadId = getOrCreateThreadID(Ident);
5664 Value *Args[] = {Ident, ThreadId};
5665
5666 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5667 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5668
5669 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5670 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5671
5672 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5673 if (Error Err = FiniCB(IP))
5674 return Err;
5675
5676 // The thread that executes the single region must set `DidIt` to 1.
5677 // This is used by __kmpc_copyprivate, to know if the caller is the
5678 // single thread or not.
5679 if (DidIt)
5680 Builder.CreateStore(Builder.getInt32(1), DidIt);
5681
5682 return Error::success();
5683 };
5684
5685 // generates the following:
5686 // if (__kmpc_single()) {
5687 // .... single region ...
5688 // __kmpc_end_single
5689 // }
5690 // __kmpc_copyprivate
5691 // __kmpc_barrier
5692
5693 InsertPointOrErrorTy AfterIP =
5694 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5695 /*Conditional*/ true,
5696 /*hasFinalize*/ true);
5697 if (!AfterIP)
5698 return AfterIP.takeError();
5699
5700 if (DidIt) {
5701 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5702 // NOTE BufSize is currently unused, so just pass 0.
5703 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5704 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5705 CPFuncs[I], DidIt);
5706 // NOTE __kmpc_copyprivate already inserts a barrier
5707 } else if (!IsNowait) {
5708 InsertPointOrErrorTy AfterIP =
5709 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5710 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5711 /* CheckCancelFlag */ false);
5712 if (!AfterIP)
5713 return AfterIP.takeError();
5714 }
5715 return Builder.saveIP();
5716}
5717
5718 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
5719 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5720 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5721
5722 if (!updateToLocation(Loc))
5723 return Loc.IP;
5724
5725 Directive OMPD = Directive::OMPD_critical;
5726 uint32_t SrcLocStrSize;
5727 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5728 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5729 Value *ThreadId = getOrCreateThreadID(Ident);
5730 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5731 Value *Args[] = {Ident, ThreadId, LockVar};
5732
5733 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5734 Function *RTFn = nullptr;
5735 if (HintInst) {
5736 // Add Hint to entry Args and create call
5737 EnterArgs.push_back(HintInst);
5738 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5739 } else {
5740 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5741 }
5742 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5743
5744 Function *ExitRTLFn =
5745 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5746 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5747
5748 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5749 /*Conditional*/ false, /*hasFinalize*/ true);
5750}
5751
5752 OpenMPIRBuilder::InsertPointTy
5753 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5754 InsertPointTy AllocaIP, unsigned NumLoops,
5755 ArrayRef<llvm::Value *> StoreValues,
5756 const Twine &Name, bool IsDependSource) {
5757 assert(
5758 llvm::all_of(StoreValues,
5759 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5760 "OpenMP runtime requires depend vec with i64 type");
5761
5762 if (!updateToLocation(Loc))
5763 return Loc.IP;
5764
5765 // Allocate space for vector and generate alloc instruction.
5766 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5767 Builder.restoreIP(AllocaIP);
5768 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5769 ArgsBase->setAlignment(Align(8));
5770 Builder.restoreIP(Loc.IP);
5771
5772 // Store the index value with offset in depend vector.
5773 for (unsigned I = 0; I < NumLoops; ++I) {
5774 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5775 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5776 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5777 STInst->setAlignment(Align(8));
5778 }
5779
5780 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5781 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5782
5783 uint32_t SrcLocStrSize;
5784 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5785 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5786 Value *ThreadId = getOrCreateThreadID(Ident);
5787 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5788
5789 Function *RTLFn = nullptr;
5790 if (IsDependSource)
5791 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5792 else
5793 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5794 Builder.CreateCall(RTLFn, Args);
5795
5796 return Builder.saveIP();
5797}
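// Editorial example (not part of the original source): a doacross loop with
// `ordered(1)` lowers each depend clause to one call of the function above,
// assuming an i64 iteration value IV (names illustrative):
//
//   // depend(source: i) -- post the current iteration:
//   OMPB.createOrderedDepend(Loc, AllocaIP, /*NumLoops=*/1, {IV},
//                            "dep", /*IsDependSource=*/true);
//   // depend(sink: i-1) -- wait on the previous iteration:
//   Value *Prev = Builder.CreateSub(IV, Builder.getInt64(1));
//   OMPB.createOrderedDepend(Loc, AllocaIP, /*NumLoops=*/1, {Prev},
//                            "dep", /*IsDependSource=*/false);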
5798
5799 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
5800 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5801 FinalizeCallbackTy FiniCB, bool IsThreads) {
5802 if (!updateToLocation(Loc))
5803 return Loc.IP;
5804
5805 Directive OMPD = Directive::OMPD_ordered;
5806 Instruction *EntryCall = nullptr;
5807 Instruction *ExitCall = nullptr;
5808
5809 if (IsThreads) {
5810 uint32_t SrcLocStrSize;
5811 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5812 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5813 Value *ThreadId = getOrCreateThreadID(Ident);
5814 Value *Args[] = {Ident, ThreadId};
5815
5816 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5817 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5818
5819 Function *ExitRTLFn =
5820 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5821 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5822 }
5823
5824 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5825 /*Conditional*/ false, /*hasFinalize*/ true);
5826}
5827
5828OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5829 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5830 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5831 bool HasFinalize, bool IsCancellable) {
5832
5833 if (HasFinalize)
5834 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5835
5836 // Create inlined region's entry and body blocks, in preparation
5837 // for conditional creation
5838 BasicBlock *EntryBB = Builder.GetInsertBlock();
5839 Instruction *SplitPos = EntryBB->getTerminator();
5840 if (!isa_and_nonnull<BranchInst>(SplitPos))
5841 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5842 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5843 BasicBlock *FiniBB =
5844 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5845
5846 Builder.SetInsertPoint(EntryBB->getTerminator());
5847 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5848
5849 // generate body
5850 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5851 /* CodeGenIP */ Builder.saveIP()))
5852 return Err;
5853
5854 // emit exit call and do any needed finalization.
5855 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5856 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5857 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5858 "Unexpected control flow graph state!!");
5859 InsertPointOrErrorTy AfterIP =
5860 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5861 if (!AfterIP)
5862 return AfterIP.takeError();
5863 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5864 "Unexpected Control Flow State!");
5865 MergeBlockIntoPredecessor(FiniBB);
5866
5867 // If we are skipping the region of a non-conditional, remove the exit
5868 // block, and clear the builder's insertion point.
5869 assert(SplitPos->getParent() == ExitBB &&
5870 "Unexpected Insertion point location!");
5871 auto merged = MergeBlockIntoPredecessor(ExitBB);
5872 BasicBlock *ExitPredBB = SplitPos->getParent();
5873 auto InsertBB = merged ? ExitPredBB : ExitBB;
5874 if (!isa_and_nonnull<BranchInst>(SplitPos))
5875 SplitPos->eraseFromParent();
5876 Builder.SetInsertPoint(InsertBB);
5877
5878 return Builder.saveIP();
5879}
5880
5881OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5882 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5883 // If there is nothing to do, return the current insertion point.
5884 if (!Conditional || !EntryCall)
5885 return Builder.saveIP();
5886
5887 BasicBlock *EntryBB = Builder.GetInsertBlock();
5888 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5889 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5890 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5891
5892 // Emit thenBB and set the Builder's insertion point there for
5893 // body generation next. Place the block after the current block.
5894 Function *CurFn = EntryBB->getParent();
5895 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5896
5897 // Move Entry branch to end of ThenBB, and replace with conditional
5898 // branch (If-stmt)
5899 Instruction *EntryBBTI = EntryBB->getTerminator();
5900 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5901 EntryBBTI->removeFromParent();
5902 Builder.SetInsertPoint(UI);
5903 Builder.Insert(EntryBBTI);
5904 UI->eraseFromParent();
5905 Builder.SetInsertPoint(ThenBB->getTerminator());
5906
5907 // return an insertion point to ExitBB.
5908 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5909}
5910
5911OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5912 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5913 bool HasFinalize) {
5914
5915 Builder.restoreIP(FinIP);
5916
5917 // If there is finalization to do, emit it before the exit call
5918 if (HasFinalize) {
5919 assert(!FinalizationStack.empty() &&
5920 "Unexpected finalization stack state!");
5921
5922 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5923 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5924
5925 if (Error Err = Fi.FiniCB(FinIP))
5926 return Err;
5927
5928 BasicBlock *FiniBB = FinIP.getBlock();
5929 Instruction *FiniBBTI = FiniBB->getTerminator();
5930
5931 // set Builder IP for call creation
5932 Builder.SetInsertPoint(FiniBBTI);
5933 }
5934
5935 if (!ExitCall)
5936 return Builder.saveIP();
5937
5938 // Place the ExitCall as the last instruction before the finalization block's terminator.
5939 ExitCall->removeFromParent();
5940 Builder.Insert(ExitCall);
5941
5942 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5943 ExitCall->getIterator());
5944}
5945
5946 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5947 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5948 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5949 if (!IP.isSet())
5950 return IP;
5951
5952 IRBuilder<>::InsertPointGuard IPG(Builder);
5953
5954 // creates the following CFG structure
5955 // OMP_Entry : (MasterAddr != PrivateAddr)?
5956 // F T
5957 // | \
5958 // | copyin.not.master
5959 // | /
5960 // v /
5961 // copyin.not.master.end
5962 // |
5963 // v
5964 // OMP.Entry.Next
5965
5966 BasicBlock *OMP_Entry = IP.getBlock();
5967 Function *CurFn = OMP_Entry->getParent();
5968 BasicBlock *CopyBegin =
5969 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5970 BasicBlock *CopyEnd = nullptr;
5971
5972 // If the entry block is terminated, split it to preserve the branch to the
5973 // following basic block (i.e. OMP.Entry.Next); otherwise leave everything as is.
5974 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5975 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5976 "copyin.not.master.end");
5977 OMP_Entry->getTerminator()->eraseFromParent();
5978 } else {
5979 CopyEnd =
5980 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5981 }
5982
5983 Builder.SetInsertPoint(OMP_Entry);
5984 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5985 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5986 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5987 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5988
5989 Builder.SetInsertPoint(CopyBegin);
5990 if (BranchtoEnd)
5991 Builder.CreateBr(CopyEnd);
5992
5993 return Builder.saveIP();
5994}
5995
5996 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
5997 Value *Size, Value *Allocator,
5998 std::string Name) {
5999 IRBuilder<>::InsertPointGuard IPG(Builder);
6000 updateToLocation(Loc);
6001
6002 uint32_t SrcLocStrSize;
6003 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6004 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6005 Value *ThreadId = getOrCreateThreadID(Ident);
6006 Value *Args[] = {ThreadId, Size, Allocator};
6007
6008 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6009
6010 return Builder.CreateCall(Fn, Args, Name);
6011}
6012
6013 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6014 Value *Addr, Value *Allocator,
6015 std::string Name) {
6016 IRBuilder<>::InsertPointGuard IPG(Builder);
6017 updateToLocation(Loc);
6018
6019 uint32_t SrcLocStrSize;
6020 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6021 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6022 Value *ThreadId = getOrCreateThreadID(Ident);
6023 Value *Args[] = {ThreadId, Addr, Allocator};
6024 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6025 return Builder.CreateCall(Fn, Args, Name);
6026}
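// Editorial example (not part of the original source): the two helpers above
// are intended to be paired; Size and Allocator are assumed to be values
// produced for the allocate clause (an i64 size and an omp_allocator handle):
//
//   CallInst *Buf = OMPB.createOMPAlloc(Loc, Size, Allocator, "buf");
//   // ... code using Buf ...
//   OMPB.createOMPFree(Loc, Buf, Allocator, "");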
6027
6028 CallInst *OpenMPIRBuilder::createOMPInteropInit(
6029 const LocationDescription &Loc, Value *InteropVar,
6030 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6031 Value *DependenceAddress, bool HaveNowaitClause) {
6032 IRBuilder<>::InsertPointGuard IPG(Builder);
6033 updateToLocation(Loc);
6034
6035 uint32_t SrcLocStrSize;
6036 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6037 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6038 Value *ThreadId = getOrCreateThreadID(Ident);
6039 if (Device == nullptr)
6040 Device = ConstantInt::get(Int32, -1);
6041 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6042 if (NumDependences == nullptr) {
6043 NumDependences = ConstantInt::get(Int32, 0);
6044 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6045 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6046 }
6047 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6048 Value *Args[] = {
6049 Ident, ThreadId, InteropVar, InteropTypeVal,
6050 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6051
6052 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6053
6054 return Builder.CreateCall(Fn, Args);
6055}
6056
6057 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6058 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6059 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6060 IRBuilder<>::InsertPointGuard IPG(Builder);
6061 updateToLocation(Loc);
6062
6063 uint32_t SrcLocStrSize;
6064 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6065 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6066 Value *ThreadId = getOrCreateThreadID(Ident);
6067 if (Device == nullptr)
6068 Device = ConstantInt::get(Int32, -1);
6069 if (NumDependences == nullptr) {
6070 NumDependences = ConstantInt::get(Int32, 0);
6071 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6072 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6073 }
6074 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6075 Value *Args[] = {
6076 Ident, ThreadId, InteropVar, Device,
6077 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6078
6079 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6080
6081 return Builder.CreateCall(Fn, Args);
6082}
6083
6084 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6085 Value *InteropVar, Value *Device,
6086 Value *NumDependences,
6087 Value *DependenceAddress,
6088 bool HaveNowaitClause) {
6089 IRBuilder<>::InsertPointGuard IPG(Builder);
6090 updateToLocation(Loc);
6091 uint32_t SrcLocStrSize;
6092 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6093 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6094 Value *ThreadId = getOrCreateThreadID(Ident);
6095 if (Device == nullptr)
6096 Device = ConstantInt::get(Int32, -1);
6097 if (NumDependences == nullptr) {
6098 NumDependences = ConstantInt::get(Int32, 0);
6099 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6100 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6101 }
6102 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6103 Value *Args[] = {
6104 Ident, ThreadId, InteropVar, Device,
6105 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6106
6107 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6108
6109 return Builder.CreateCall(Fn, Args);
6110}
6111
6112CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6113 const LocationDescription &Loc, llvm::Value *Pointer,
6114 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6115 IRBuilder<>::InsertPointGuard IPG(Builder);
6116 updateToLocation(Loc);
6117
6118 uint32_t SrcLocStrSize;
6119 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6120 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6121 Value *ThreadId = getOrCreateThreadID(Ident);
6122 Constant *ThreadPrivateCache =
6123 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6124 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6125
6126 Function *Fn =
6127 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6128
6129 return Builder.CreateCall(Fn, Args);
6130}
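// Illustrative sketch (not part of the builder): for a threadprivate global
// @gvar of 8 bytes and a hypothetical cache name "gvar.cache", this emits
// roughly
//
//   %tp = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %gtid,
//                                               ptr @gvar, i64 8,
//                                               ptr @gvar.cache)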
6131
6132OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6133 const LocationDescription &Loc,
6134 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6135 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6136 "expected num_threads and num_teams to be specified");
6137
6138 if (!updateToLocation(Loc))
6139 return Loc.IP;
6140
6141 uint32_t SrcLocStrSize;
6142 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6143 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6144 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6145 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6146 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6147 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6148 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6149
6150 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6151 Function *Kernel = DebugKernelWrapper;
6152
6153 // We need to strip the debug prefix to get the correct kernel name.
6154 StringRef KernelName = Kernel->getName();
6155 const std::string DebugPrefix = "_debug__";
6156 if (KernelName.ends_with(DebugPrefix)) {
6157 KernelName = KernelName.drop_back(DebugPrefix.length());
6158 Kernel = M.getFunction(KernelName);
6159 assert(Kernel && "Expected the real kernel to exist");
6160 }
6161
6162 // Manifest the launch configuration in the metadata matching the kernel
6163 // environment.
6164 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6165 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6166
6167 // If MaxThreads is not set, select the maximum of the default workgroup
6168 // size and the MinThreads value.
6169 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6170 if (MaxThreadsVal < 0)
6171 MaxThreadsVal = std::max(
6172 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6173
6174 if (MaxThreadsVal > 0)
6175 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6176
6177 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6178 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6179 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6180 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6181 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6182 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6183
6184 Function *Fn = getOrCreateRuntimeFunctionPtr(
6185 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6186 const DataLayout &DL = Fn->getDataLayout();
6187
6188 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6189 Constant *DynamicEnvironmentInitializer =
6190 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6191 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6192 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6193 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6194 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6195 DL.getDefaultGlobalsAddressSpace());
6196 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6197
6198 Constant *DynamicEnvironment =
6199 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6200 ? DynamicEnvironmentGV
6201 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6202 DynamicEnvironmentPtr);
6203
6204 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6205 ConfigurationEnvironment, {
6206 UseGenericStateMachineVal,
6207 MayUseNestedParallelismVal,
6208 IsSPMDVal,
6209 MinThreads,
6210 MaxThreads,
6211 MinTeams,
6212 MaxTeams,
6213 ReductionDataSize,
6214 ReductionBufferLength,
6215 });
6216 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6217 KernelEnvironment, {
6218 ConfigurationEnvironmentInitializer,
6219 Ident,
6220 DynamicEnvironment,
6221 });
6222 std::string KernelEnvironmentName =
6223 (KernelName + "_kernel_environment").str();
6224 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6225 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6226 KernelEnvironmentInitializer, KernelEnvironmentName,
6227 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6228 DL.getDefaultGlobalsAddressSpace());
6229 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6230
6231 Constant *KernelEnvironment =
6232 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6233 ? KernelEnvironmentGV
6234 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6235 KernelEnvironmentPtr);
6236 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6237 CallInst *ThreadKind =
6238 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6239
6240 Value *ExecUserCode = Builder.CreateICmpEQ(
6241 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6242 "exec_user_code");
6243
6244 // ThreadKind = __kmpc_target_init(...)
6245 // if (ThreadKind == -1)
6246 // user_code
6247 // else
6248 // return;
6249
6250 auto *UI = Builder.CreateUnreachable();
6251 BasicBlock *CheckBB = UI->getParent();
6252 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6253
6254 BasicBlock *WorkerExitBB = BasicBlock::Create(
6255 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6256 Builder.SetInsertPoint(WorkerExitBB);
6257 Builder.CreateRetVoid();
6258
6259 auto *CheckBBTI = CheckBB->getTerminator();
6260 Builder.SetInsertPoint(CheckBBTI);
6261 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6262
6263 CheckBBTI->eraseFromParent();
6264 UI->eraseFromParent();
6265
6266 // Continue in the "user_code" block, see diagram above and in
6267 // openmp/libomptarget/deviceRTLs/common/include/target.h.
6268 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6269}
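// Illustrative sketch (not part of the builder): for a kernel named "foo",
// the guard emitted above looks roughly like
//
//   %tk = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
//
// so worker threads leave via %worker.exit while the main thread falls
// through into the user code.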
6270
6271void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6272 int32_t TeamsReductionDataSize,
6273 int32_t TeamsReductionBufferLength) {
6274 if (!updateToLocation(Loc))
6275 return;
6276
6277 Function *Fn = getOrCreateRuntimeFunctionPtr(
6278 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6279
6280 Builder.CreateCall(Fn, {});
6281
6282 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6283 return;
6284
6285 Function *Kernel = Builder.GetInsertBlock()->getParent();
6286 // We need to strip the debug prefix to get the correct kernel name.
6287 StringRef KernelName = Kernel->getName();
6288 const std::string DebugPrefix = "_debug__";
6289 if (KernelName.ends_with(DebugPrefix))
6290 KernelName = KernelName.drop_back(DebugPrefix.length());
6291 auto *KernelEnvironmentGV =
6292 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6293 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6294 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6295 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6296 KernelEnvironmentInitializer,
6297 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6298 NewInitializer = ConstantFoldInsertValueInstruction(
6299 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6300 {0, 8});
6301 KernelEnvironmentGV->setInitializer(NewInitializer);
6302}
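// Note on the patching above: the two insertvalue folds rewrite fields 7 and
// 8 of the ConfigurationEnvironment member (index 0) of the kernel
// environment initializer, i.e. the ReductionDataSize and
// ReductionBufferLength slots laid out in createTargetInit.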
6303
6304static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6305 Module &M = *Kernel.getParent();
6306 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6307 for (auto *Op : MD->operands()) {
6308 if (Op->getNumOperands() != 3)
6309 continue;
6310 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6311 if (!KernelOp || KernelOp->getValue() != &Kernel)
6312 continue;
6313 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6314 if (!Prop || Prop->getString() != Name)
6315 continue;
6316 return Op;
6317 }
6318 return nullptr;
6319}
6320
6321static void updateNVPTXMetadata(Function &Kernel, StringRef Name,
6322 int32_t Value, bool Min) {
6323 // Update the "maxntidx" metadata for NVIDIA, or add it.
6324 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6325 if (ExistingOp) {
6326 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6327 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6328 ExistingOp->replaceOperandWith(
6329 2, ConstantAsMetadata::get(ConstantInt::get(
6330 OldVal->getValue()->getType(),
6331 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6332 } else {
6333 LLVMContext &Ctx = Kernel.getContext();
6334 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6335 MDString::get(Ctx, Name),
6336 ConstantAsMetadata::get(
6337 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6338 // Append metadata to nvvm.annotations
6339 Module &M = *Kernel.getParent();
6340 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6341 MD->addOperand(MDNode::get(Ctx, MDVals));
6342 }
6343}
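// Illustrative sketch (not part of the builder): for a kernel @foo with a
// thread bound of 128 this produces module metadata of the form
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @foo, !"maxntidx", i32 128}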
6344
6345std::pair<int32_t, int32_t>
6346OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6347 int32_t ThreadLimit =
6348 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6349
6350 if (T.isAMDGPU()) {
6351 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6352 if (!Attr.isValid() || !Attr.isStringAttribute())
6353 return {0, ThreadLimit};
6354 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6355 int32_t LB, UB;
6356 if (!llvm::to_integer(UBStr, UB, 10))
6357 return {0, ThreadLimit};
6358 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6359 if (!llvm::to_integer(LBStr, LB, 10))
6360 return {0, UB};
6361 return {LB, UB};
6362 }
6363
6364 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6365 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6366 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6367 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6368 }
6369 return {0, ThreadLimit};
6370}
6371
6372void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6373 Function &Kernel, int32_t LB,
6374 int32_t UB) {
6375 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6376
6377 if (T.isAMDGPU()) {
6378 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6379 llvm::utostr(LB) + "," + llvm::utostr(UB));
6380 return;
6381 }
6382
6383 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6384}
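// Illustrative sketch (not part of the builder): for LB = 1 and UB = 256 the
// kernel ends up carrying, depending on the target, either
//
//   "omp_target_thread_limit"="256" "amdgpu-flat-work-group-size"="1,256"
//
// as function attributes on AMDGPU, or the "maxntidx" NVPTX annotation shown
// earlier.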
6385
6386std::pair<int32_t, int32_t>
6387OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &T, Function &Kernel) {
6388 // TODO: Read from backend annotations if available.
6389 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6390}
6391
6392void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6393 int32_t LB, int32_t UB) {
6394 if (T.isNVPTX())
6395 if (UB > 0)
6396 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6397 if (T.isAMDGPU())
6398 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6399
6400 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6401}
6402
6403void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6404 Function *OutlinedFn) {
6405 if (Config.isTargetDevice()) {
6406 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6407 // TODO: Determine if DSO local can be set to true.
6408 OutlinedFn->setDSOLocal(false);
6409 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6410 if (T.isAMDGCN())
6411 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6412 }
6413}
6414
6415Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6416 StringRef EntryFnIDName) {
6417 if (Config.isTargetDevice()) {
6418 assert(OutlinedFn && "The outlined function must exist if embedded");
6419 return OutlinedFn;
6420 }
6421
6422 return new GlobalVariable(
6423 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6424 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6425}
6426
6427Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6428 StringRef EntryFnName) {
6429 if (OutlinedFn)
6430 return OutlinedFn;
6431
6432 assert(!M.getGlobalVariable(EntryFnName, true) &&
6433 "Named kernel already exists?");
6434 return new GlobalVariable(
6435 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6436 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6437}
6438
6439Error OpenMPIRBuilder::emitTargetRegionFunction(
6440 TargetRegionEntryInfo &EntryInfo,
6441 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6442 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6443
6444 SmallString<64> EntryFnName;
6445 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6446
6447 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
6448 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6449 if (!CBResult)
6450 return CBResult.takeError();
6451 OutlinedFn = *CBResult;
6452 } else {
6453 OutlinedFn = nullptr;
6454 }
6455
6456 // If this target outline function is not an offload entry, we don't need to
6457 // register it. This may be the case for a false if clause, or when there
6458 // are no OpenMP targets.
6459 if (!IsOffloadEntry)
6460 return Error::success();
6461
6462 std::string EntryFnIDName =
6463 Config.isTargetDevice()
6464 ? std::string(EntryFnName)
6465 : createPlatformSpecificName({EntryFnName, "region_id"});
6466
6467 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6468 EntryFnName, EntryFnIDName);
6469 return Error::success();
6470}
6471
6472Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6473 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6474 StringRef EntryFnName, StringRef EntryFnIDName) {
6475 if (OutlinedFn)
6476 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6477 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6478 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6479 OffloadInfoManager.registerTargetRegionEntryInfo(
6480 EntryInfo, EntryAddr, OutlinedFnID,
6481 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6482 return OutlinedFnID;
6483}
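// Illustrative sketch (not part of the builder): on the host, the region ID
// created above typically materializes as a weak one-byte global whose
// address identifies the entry at runtime, e.g. for a hypothetical entry:
//
//   @.__omp_offloading_<devid>_<fileid>_foo_l12.region_id = weak constant i8 0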
6484
6485OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
6486 const LocationDescription &Loc, InsertPointTy AllocaIP,
6487 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6488 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6489 omp::RuntimeFunction *MapperFunc,
6490 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
6491 BodyGenTy BodyGenType)>
6492 BodyGenCB,
6493 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6494 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6495 if (!updateToLocation(Loc))
6496 return InsertPointTy();
6497
6498 Builder.restoreIP(CodeGenIP);
6499 // Disable TargetData CodeGen on Device pass.
6500 if (Config.IsTargetDevice.value_or(false)) {
6501 if (BodyGenCB) {
6502 InsertPointOrErrorTy AfterIP =
6503 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6504 if (!AfterIP)
6505 return AfterIP.takeError();
6506 Builder.restoreIP(*AfterIP);
6507 }
6508 return Builder.saveIP();
6509 }
6510
6511 bool IsStandAlone = !BodyGenCB;
6512 MapInfosTy *MapInfo;
6513 // Generate the code for the opening of the data environment. Capture all the
6514 // arguments of the runtime call by reference because they are used in the
6515 // closing of the region.
6516 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6517 InsertPointTy CodeGenIP) -> Error {
6518 MapInfo = &GenMapInfoCB(Builder.saveIP());
6519 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6520 /*IsNonContiguous=*/true, DeviceAddrCB,
6521 CustomMapperCB);
6522
6523 TargetDataRTArgs RTArgs;
6524 emitOffloadingArraysArgument(Builder, RTArgs, Info);
6525
6526 // Emit the number of elements in the offloading arrays.
6527 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6528
6529 // Source location for the ident struct
6530 if (!SrcLocInfo) {
6531 uint32_t SrcLocStrSize;
6532 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6533 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6534 }
6535
6536 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6537 SrcLocInfo, DeviceID,
6538 PointerNum, RTArgs.BasePointersArray,
6539 RTArgs.PointersArray, RTArgs.SizesArray,
6540 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6541 RTArgs.MappersArray};
6542
6543 if (IsStandAlone) {
6544 assert(MapperFunc && "MapperFunc missing for standalone target data");
6545
6546 auto TaskBodyCB = [&](Value *, Value *,
6547 IRBuilderBase::InsertPoint) -> Error {
6548 if (Info.HasNoWait) {
6549 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6550 llvm::Constant::getNullValue(VoidPtr),
6551 llvm::Constant::getNullValue(Int32),
6552 llvm::Constant::getNullValue(VoidPtr)});
6553 }
6554
6555 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6556 OffloadingArgs);
6557
6558 if (Info.HasNoWait) {
6559 BasicBlock *OffloadContBlock =
6560 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6561 auto *CurFn = Builder.GetInsertBlock()->getParent();
6562 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6564 }
6565 return Error::success();
6566 };
6567
6568 bool RequiresOuterTargetTask = Info.HasNoWait;
6569 if (!RequiresOuterTargetTask)
6570 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6571 /*TargetTaskAllocaIP=*/{}));
6572 else
6573 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6574 /*Dependencies=*/{}, Info.HasNoWait));
6575 } else {
6576 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6577 omp::OMPRTL___tgt_target_data_begin_mapper);
6578
6579 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6580
6581 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6582 if (isa<AllocaInst>(DeviceMap.second.second)) {
6583 auto *LI =
6584 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6585 Builder.CreateStore(LI, DeviceMap.second.second);
6586 }
6587 }
6588
6589 // If device pointer privatization is required, emit the body of the
6590 // region here. It will have to be duplicated: with and without
6591 // privatization.
6592 InsertPointOrErrorTy AfterIP =
6593 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6594 if (!AfterIP)
6595 return AfterIP.takeError();
6596 Builder.restoreIP(*AfterIP);
6597 }
6598 return Error::success();
6599 };
6600
6601 // If we need device pointer privatization, we need to emit the body of the
6602 // region with no privatization in the 'else' branch of the conditional.
6603 // Otherwise, we don't have to do anything.
6604 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6605 InsertPointTy CodeGenIP) -> Error {
6606 InsertPointOrErrorTy AfterIP =
6607 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6608 if (!AfterIP)
6609 return AfterIP.takeError();
6610 Builder.restoreIP(*AfterIP);
6611 return Error::success();
6612 };
6613
6614 // Generate code for the closing of the data region.
6615 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6616 TargetDataRTArgs RTArgs;
6617 Info.EmitDebug = !MapInfo->Names.empty();
6618 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6619
6620 // Emit the number of elements in the offloading arrays.
6621 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6622
6623 // Source location for the ident struct
6624 if (!SrcLocInfo) {
6625 uint32_t SrcLocStrSize;
6626 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6627 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6628 }
6629
6630 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6631 PointerNum, RTArgs.BasePointersArray,
6632 RTArgs.PointersArray, RTArgs.SizesArray,
6633 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6634 RTArgs.MappersArray};
6635 Function *EndMapperFunc =
6636 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6637
6638 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6639 return Error::success();
6640 };
6641
6642 // We don't have to do anything to close the region if the if clause evaluates
6643 // to false.
6644 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6645 return Error::success();
6646 };
6647
6648 Error Err = [&]() -> Error {
6649 if (BodyGenCB) {
6650 Error Err = [&]() {
6651 if (IfCond)
6652 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6653 return BeginThenGen(AllocaIP, Builder.saveIP());
6654 }();
6655
6656 if (Err)
6657 return Err;
6658
6659 // If we don't require privatization of device pointers, we emit the body
6660 // in between the runtime calls. This avoids duplicating the body code.
6661 InsertPointOrErrorTy AfterIP =
6662 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6663 if (!AfterIP)
6664 return AfterIP.takeError();
6665 Builder.restoreIP(*AfterIP);
6666
6667 if (IfCond)
6668 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6669 return EndThenGen(AllocaIP, Builder.saveIP());
6670 }
6671 if (IfCond)
6672 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6673 return BeginThenGen(AllocaIP, Builder.saveIP());
6674 }();
6675
6676 if (Err)
6677 return Err;
6678
6679 return Builder.saveIP();
6680}
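// Illustrative sketch (not part of the builder): for a plain `omp target
// data` region the callbacks above produce roughly
//
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device, i32 %n,
//       ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes, ptr %mapnames,
//       ptr %mappers)
//   ; ... region body ...
//   call void @__tgt_target_data_end_mapper(ptr @ident, i64 %device, i32 %n,
//       ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes, ptr %mapnames,
//       ptr %mappers)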
6681
6682FunctionCallee OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6683 bool IVSigned,
6684 bool IsGPUDistribute) {
6685 assert((IVSize == 32 || IVSize == 64) &&
6686 "IV size is not compatible with the omp runtime");
6687 RuntimeFunction Name;
6688 if (IsGPUDistribute)
6689 Name = IVSize == 32
6690 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6691 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6692 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6693 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6694 else
6695 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6696 : omp::OMPRTL___kmpc_for_static_init_4u)
6697 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6698 : omp::OMPRTL___kmpc_for_static_init_8u);
6699
6700 return getOrCreateRuntimeFunction(M, Name);
6701}
6702
6703FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6704 bool IVSigned) {
6705 assert((IVSize == 32 || IVSize == 64) &&
6706 "IV size is not compatible with the omp runtime");
6707 RuntimeFunction Name = IVSize == 32
6708 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6709 : omp::OMPRTL___kmpc_dispatch_init_4u)
6710 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6711 : omp::OMPRTL___kmpc_dispatch_init_8u);
6712
6713 return getOrCreateRuntimeFunction(M, Name);
6714}
6715
6716FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6717 bool IVSigned) {
6718 assert((IVSize == 32 || IVSize == 64) &&
6719 "IV size is not compatible with the omp runtime");
6720 RuntimeFunction Name = IVSize == 32
6721 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6722 : omp::OMPRTL___kmpc_dispatch_next_4u)
6723 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6724 : omp::OMPRTL___kmpc_dispatch_next_8u);
6725
6726 return getOrCreateRuntimeFunction(M, Name);
6727}
6728
6729FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6730 bool IVSigned) {
6731 assert((IVSize == 32 || IVSize == 64) &&
6732 "IV size is not compatible with the omp runtime");
6733 RuntimeFunction Name = IVSize == 32
6734 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6735 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6736 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6737 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6738
6739 return getOrCreateRuntimeFunction(M, Name);
6740}
6741
6742FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
6743 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6744}
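// Illustrative note: the selections above map, e.g., a signed 32-bit IV to
// the _4 entry points and an unsigned 64-bit IV to the _8u ones, so a
// dynamically scheduled loop comes out roughly as
//
//   call void @__kmpc_dispatch_init_4(ptr @ident, i32 %gtid, i32 %sched,
//                                     i32 %lb, i32 %ub, i32 %st, i32 %chunk)
//   %more = call i32 @__kmpc_dispatch_next_4(ptr @ident, i32 %gtid, ptr %last,
//                                            ptr %plb, ptr %pub, ptr %pst)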
6745
6746static Expected<Function *> createOutlinedFunction(
6747 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6748 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
6749 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
6750 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6751 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6752 SmallVector<Type *> ParameterTypes;
6753 if (OMPBuilder.Config.isTargetDevice()) {
6754 // Add the "implicit" runtime argument we use to provide launch specific
6755 // information for target devices.
6756 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6757 ParameterTypes.push_back(Int8PtrTy);
6758
6759 // All parameters to target devices are passed as pointers
6760 // or i64. This assumes 64-bit address spaces/pointers.
6761 for (auto &Arg : Inputs)
6762 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6763 ? Arg->getType()
6764 : Type::getInt64Ty(Builder.getContext()));
6765 } else {
6766 for (auto &Arg : Inputs)
6767 ParameterTypes.push_back(Arg->getType());
6768 }
6769
6770 auto BB = Builder.GetInsertBlock();
6771 auto M = BB->getModule();
6772 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6773 /*isVarArg*/ false);
6774 auto Func =
6775 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6776
6777 // Forward target-cpu and target-features function attributes from the
6778 // original function to the new outlined function.
6779 Function *ParentFn = Builder.GetInsertBlock()->getParent();
6780
6781 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
6782 if (TargetCpuAttr.isStringAttribute())
6783 Func->addFnAttr(TargetCpuAttr);
6784
6785 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
6786 if (TargetFeaturesAttr.isStringAttribute())
6787 Func->addFnAttr(TargetFeaturesAttr);
6788
6789 if (OMPBuilder.Config.isTargetDevice()) {
6790 Value *ExecMode =
6791 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
6792 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
6793 }
6794
6795 // Save insert point.
6796 IRBuilder<>::InsertPointGuard IPG(Builder);
6797 // If there's a DISubprogram associated with the current function, then
6798 // generate one for the outlined function.
6799 if (Function *ParentFunc = BB->getParent()) {
6800 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6801 DICompileUnit *CU = SP->getUnit();
6802 DIBuilder DB(*M, true, CU);
6803 DebugLoc DL = Builder.getCurrentDebugLocation();
6804 if (DL) {
6805 // TODO: We are using nullopt for arguments at the moment. This will
6806 // need to be updated when debug data is being generated for variables.
6807 DISubroutineType *Ty =
6808 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6809 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6810 DISubprogram::SPFlagOptimized |
6811 DISubprogram::SPFlagLocalToUnit;
6812
6813 DISubprogram *OutlinedSP = DB.createFunction(
6814 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6815 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6816
6817 // Attach subprogram to the function.
6818 Func->setSubprogram(OutlinedSP);
6819 // Update the CurrentDebugLocation in the builder so that right scope
6820 // is used for things inside the outlined function.
6821 Builder.SetCurrentDebugLocation(
6822 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6823 OutlinedSP, DL.getInlinedAt()));
6824 }
6825 }
6826 }
6827
6828 // Generate the region into the function.
6829 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6830 Builder.SetInsertPoint(EntryBB);
6831
6832 // Insert target init call in the device compilation pass.
6833 if (OMPBuilder.Config.isTargetDevice())
6834 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
6835
6836 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6837
6838 // As we embed the user code in the middle of our target region after we
6839 // generate entry code, we must move what allocas we can into the entry
6840 // block to avoid possibly breaking optimisations for the device.
6841 if (OMPBuilder.Config.isTargetDevice())
6842 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6843
6844 // Insert target deinit call in the device compilation pass.
6845 BasicBlock *OutlinedBodyBB =
6846 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6847 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
6848 Builder.saveIP(),
6849 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6850 if (!AfterIP)
6851 return AfterIP.takeError();
6852 Builder.restoreIP(*AfterIP);
6853 if (OMPBuilder.Config.isTargetDevice())
6854 OMPBuilder.createTargetDeinit(Builder);
6855
6856 // Insert return instruction.
6857 Builder.CreateRetVoid();
6858
6859 // New Alloca IP at entry point of created device function.
6860 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6861 auto AllocaIP = Builder.saveIP();
6862
6863 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6864
6865 // Skip the artificial dyn_ptr on the device.
6866 const auto &ArgRange =
6867 OMPBuilder.Config.isTargetDevice()
6868 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6869 : Func->args();
6870
6871 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6872 // Things like GEPs can come in the form of Constants. Constants and
6873 // ConstantExprs do not know what they are contained in, so we must dig
6874 // a little to find an instruction that tells us whether they are used
6875 // inside of the function we are outlining. We also replace the original
6876 // constant expression with an equivalent instruction, because an
6877 // instruction allows easy modification in the following loop: we then
6878 // know the constant (now an instruction) is owned by our target function,
6879 // and replaceUsesOfWith can be invoked on it (this cannot be done with
6880 // constants). A brand new instruction also allows us to be cautious, as
6881 // it is perhaps possible the old expression was used inside of the
6882 // function but also exists and is used externally
6883 // (unlikely by the nature of a Constant, but still).
6884 // NOTE: We cannot remove dead constants that have been rewritten to
6885 // instructions at this stage; we would run the risk of breaking later
6886 // lowering by doing so, as we could still be in the process of lowering
6887 // the module from MLIR to LLVM-IR and the MLIR lowering may still require
6888 // the original constants we have created rewritten versions of.
6889 if (auto *Const = dyn_cast<Constant>(Input))
6890 convertUsersOfConstantsToInstructions(Const, Func, false);
6891
6892 // Collect all the instructions
6893 for (User *User : make_early_inc_range(Input->users()))
6894 if (auto *Instr = dyn_cast<Instruction>(User))
6895 if (Instr->getFunction() == Func)
6896 Instr->replaceUsesOfWith(Input, InputCopy);
6897 };
6898
6899 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6900
6901 // Rewrite uses of input values to parameters.
6902 for (auto InArg : zip(Inputs, ArgRange)) {
6903 Value *Input = std::get<0>(InArg);
6904 Argument &Arg = std::get<1>(InArg);
6905 Value *InputCopy = nullptr;
6906
6907 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
6908 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6909 if (!AfterIP)
6910 return AfterIP.takeError();
6911 Builder.restoreIP(*AfterIP);
6912
6913 // In certain cases a Global may be set up for replacement; however, this
6914 // Global may be used in multiple arguments to the kernel, just segmented
6915 // apart. For example, if we have a global array that is sectioned into
6916 // multiple mappings (technically not legal in OpenMP, but there is a case
6917 // in Fortran for Common Blocks where this is necessary), we will end up
6918 // with GEPs into this array inside the kernel that refer to the Global
6919 // but are technically separate arguments to the kernel for all intents
6920 // and purposes. If we have mapped a segment that requires a GEP into the
6921 // 0-th index, it will fold into a reference to the Global; if we then
6922 // encounter this folded GEP during replacement, all of the references to
6923 // the Global in the kernel will be replaced with the argument we have
6924 // generated that corresponds to it, including any other GEPs that refer
6925 // to the Global and may be other arguments. This would invalidate all of
6926 // the other preceding mapped arguments that refer to the same Global and
6927 // may be separate segments. To prevent this, we defer global processing
6928 // until all other processing has been performed.
6929 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6930 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6931 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6932 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6933 continue;
6934 }
6935
6936 ReplaceValue(Input, InputCopy, Func);
6937 }
6938
6939 // Replace all of our deferred Input values, currently just Globals.
6940 for (auto Deferred : DeferredReplacement)
6941 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6942
6943 return Func;
6944}
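// Illustrative sketch (not part of the builder): in a device pass targeting
// AMDGPU, the function created above comes out roughly as
//
//   define weak_odr protected amdgpu_kernel void
//       @__omp_offloading_<id>_foo_l10(ptr %dyn_ptr, i64 %x, ptr %a)
//
// where the leading %dyn_ptr is the implicit launch argument added for
// target devices and <id>, foo, l10 stand in for the real entry info.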
6945
6946/// Create an entry point for a target task with the following.
6947/// It'll have the following signature
6948/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6949/// This function is called from emitTargetTask once the
6950 /// code to launch the target kernel has been outlined already.
6951static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6952 IRBuilderBase &Builder,
6953 CallInst *StaleCI) {
6954 Module &M = OMPBuilder.M;
6955 // KernelLaunchFunction is the target launch function, i.e.
6956 // the function that sets up kernel arguments and calls
6957 // __tgt_target_kernel to launch the kernel on the device.
6958 //
6959 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6960
6961 // StaleCI is the CallInst which is the call to the outlined
6962 // target kernel launch function. If there are values that the
6963 // outlined function uses then these are aggregated into a structure
6964 // which is passed as the second argument. If not, then there's
6965 // only one argument, the threadID. So, StaleCI can be
6966 //
6967 // %structArg = alloca { ptr, ptr }, align 8
6968 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6969 // store ptr %20, ptr %gep_, align 8
6970 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6971 // store ptr %21, ptr %gep_8, align 8
6972 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6973 //
6974 // OR
6975 //
6976 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6977 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
6978 StaleCI->getIterator());
6979 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6980 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6981 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6982 Type *TaskTy = OMPBuilder.Task;
6983 auto ProxyFnTy =
6984 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6985 /* isVarArg */ false);
6986 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6987 ".omp_target_task_proxy_func",
6988 Builder.GetInsertBlock()->getModule());
6989 ProxyFn->getArg(0)->setName("thread.id");
6990 ProxyFn->getArg(1)->setName("task");
6991
6992 BasicBlock *EntryBB =
6993 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6994 Builder.SetInsertPoint(EntryBB);
6995
6996 bool HasShareds = StaleCI->arg_size() > 1;
6997 // TODO: This is a temporary assert to prove to ourselves that
6998 // the outlined target launch function is always going to have
6999 // at most two arguments if there is any data shared between
7000 // host and device.
7001 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
7002 "StaleCI with shareds should have exactly two arguments.");
7003 if (HasShareds) {
7004 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7005 assert(ArgStructAlloca &&
7006 "Unable to find the alloca instruction corresponding to arguments "
7007 "for extracted function");
7008 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7009
7010 AllocaInst *NewArgStructAlloca =
7011 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7012 Value *TaskT = ProxyFn->getArg(1);
7013 Value *ThreadId = ProxyFn->getArg(0);
7014 Value *SharedsSize =
7015 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7016
7017 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7018 LoadInst *LoadShared =
7019 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7020
7021 Builder.CreateMemCpy(
7022 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7023 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7024
7025 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7026 }
7027 Builder.CreateRetVoid();
7028 return ProxyFn;
7029}
7030
7031static Error emitTargetOutlinedFunction(
7032 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7033 TargetRegionEntryInfo &EntryInfo,
7034 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7035 Function *&OutlinedFn, Constant *&OutlinedFnID,
7036 SmallVectorImpl<Value *> &Inputs,
7037 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7038 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7039
7040 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7041 [&](StringRef EntryFnName) {
7042 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7043 EntryFnName, Inputs, CBFunc,
7044 ArgAccessorFuncCB);
7045 };
7046
7047 return OMPBuilder.emitTargetRegionFunction(
7048 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7049 OutlinedFnID);
7050}
7051
7052OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7053 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7054 OpenMPIRBuilder::InsertPointTy AllocaIP,
7055 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7056 bool HasNoWait) {
7057
7058 // The following explains the code-gen scenario for the `target` directive.
7059 // A similar scenario is followed for other device-related directives
7060 // (e.g. `target enter data`), since we only need to emit a task that
7061 // encapsulates the proper runtime call.
7062 //
7063 // When we arrive at this function, the target region itself has been
7064 // outlined into the function OutlinedFn.
7065 // So at this point, for
7066 // --------------------------------------------------
7067 // void user_code_that_offloads(...) {
7068 // omp target depend(..) map(from:a) map(to:b, c)
7069 // a = b + c
7070 // }
7071 //
7072 // --------------------------------------------------
7073 //
7074 // we have
7075 //
7076 // --------------------------------------------------
7077 //
7078 // void user_code_that_offloads(...) {
7079 // %.offload_baseptrs = alloca [3 x ptr], align 8
7080 // %.offload_ptrs = alloca [3 x ptr], align 8
7081 // %.offload_mappers = alloca [3 x ptr], align 8
7082 // ;; target region has been outlined and now we need to
7083 // ;; offload to it via a target task.
7084 // }
7085 // void outlined_device_function(ptr a, ptr b, ptr c) {
7086 // *a = *b + *c
7087 // }
7088 //
7089 // We have to now do the following
7090 // (i) Make an offloading call to outlined_device_function using the OpenMP
7091 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7092 // emitted by emitKernelLaunch
7093 // (ii) Create a task entry point function that calls kernel_launch_function
7094 // and is the entry point for the target task. See
7095 // '@.omp_target_task_proxy_func in the pseudocode below.
7096 // (iii) Create a task with the task entry point created in (ii)
7097 //
7098 // That is we create the following
7099 //
7100 // void user_code_that_offloads(...) {
7101 // %.offload_baseptrs = alloca [3 x ptr], align 8
7102 // %.offload_ptrs = alloca [3 x ptr], align 8
7103 // %.offload_mappers = alloca [3 x ptr], align 8
7104 //
7105 // %structArg = alloca { ptr, ptr, ptr }, align 8
7106 // %strucArg[0] = %.offload_baseptrs
7107 // %strucArg[1] = %.offload_ptrs
7108 // %strucArg[2] = %.offload_mappers
7109 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7110 // @.omp_target_task_proxy_func)
7111 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7112 // dependencies_array = ...
7113 // ;; if nowait not present
7114 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7115 // call @__kmpc_omp_task_begin_if0(...)
7116 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7117 // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
7118 // }
7119 //
7120 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7121 // ptr %task) {
7122 // %structArg = alloca {ptr, ptr, ptr}
7123 // %shared_data = load (getelementptr %task, 0, 0)
7124 // memcpy(%structArg, %shared_data, sizeof(structArg))
7125 // kernel_launch_function(%thread.id, %structArg)
7126 // }
7127 //
7128 // We need the proxy function because the signature of the task entry point
7129 // expected by kmpc_omp_task is always the same and will be different from
7130 // that of the kernel_launch function.
7131 //
7132 // kernel_launch_function is generated by emitKernelLaunch and has the
7133 // always_inline attribute.
7134 // void kernel_launch_function(thread_id,
7135 // structArg) alwaysinline {
7136 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7137 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7138 // offload_ptrs = load(getelementptr structArg, 0, 1)
7139 // offload_mappers = load(getelementptr structArg, 0, 2)
7140 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7141 // ; offload_mappers
7142 // call i32 @__tgt_target_kernel(...,
7143 // outlined_device_function,
7144 // ptr %kernel_args)
7145 // }
7146 // void outlined_device_function(ptr a, ptr b, ptr c) {
7147 // *a = *b + *c
7148 // }
7149 //
7150 BasicBlock *TargetTaskBodyBB =
7151 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7152 BasicBlock *TargetTaskAllocaBB =
7153 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7154
7155 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7156 TargetTaskAllocaBB->begin());
7157 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7158
7159 OutlineInfo OI;
7160 OI.EntryBB = TargetTaskAllocaBB;
7161 OI.OuterAllocaBB = AllocaIP.getBlock();
7162
7163 // Add the thread ID argument.
7164 SmallVector<Instruction *, 4> ToBeDeleted;
7165 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7166 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7167
7168 Builder.restoreIP(TargetTaskBodyIP);
7169
7170 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7171 return Err;
7172
7173 OI.ExitBB = Builder.saveIP().getBlock();
7174 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7175 DeviceID](Function &OutlinedFn) mutable {
7176 assert(OutlinedFn.getNumUses() == 1 &&
7177 "there must be a single user for the outlined function");
7178
7179 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7180 bool HasShareds = StaleCI->arg_size() > 1;
7181
7182 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7183
7184 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7185 << "\n");
7186
7187 Builder.SetInsertPoint(StaleCI);
7188
7189 // Gather the arguments for emitting the runtime call.
7190 uint32_t SrcLocStrSize;
7191 Constant *SrcLocStr =
7192 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
7193 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7194
7195 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7196 //
7197 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7198 // the DeviceID to the deferred task, and because
7199 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7200 Function *TaskAllocFn =
7201 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7202 : getOrCreateRuntimeFunctionPtr(
7203 OMPRTL___kmpc_omp_target_task_alloc);
7204
7205 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7206 // call.
7207 Value *ThreadID = getOrCreateThreadID(Ident);
7208
7209 // Argument - `sizeof_kmp_task_t` (TaskSize)
7210 // Tasksize refers to the size in bytes of kmp_task_t data structure
7211 // including private vars accessed in task.
7212 // TODO: add kmp_task_t_with_privates (privates)
7213 Value *TaskSize =
7214 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
7215
7216 // Argument - `sizeof_shareds` (SharedsSize)
7217 // SharedsSize refers to the shareds array size in the kmp_task_t data
7218 // structure.
7219 Value *SharedsSize = Builder.getInt64(0);
7220 if (HasShareds) {
7221 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7222 assert(ArgStructAlloca &&
7223 "Unable to find the alloca instruction corresponding to arguments "
7224 "for extracted function");
7225 auto *ArgStructType =
7226 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7227 assert(ArgStructType && "Unable to find struct type corresponding to "
7228 "arguments for extracted function");
7229 SharedsSize =
7230 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7231 }
7232
7233 // Argument - `flags`
7234 // Task is tied iff (Flags & 1) == 1.
7235 // Task is untied iff (Flags & 1) == 0.
7236 // Task is final iff (Flags & 2) == 2.
7237 // Task is not final iff (Flags & 2) == 0.
7238 // A target task is not final and is untied.
7239 Value *Flags = Builder.getInt32(0);
7240
7241 // Emit the @__kmpc_omp_task_alloc runtime call
7242 // The runtime call returns a pointer to an area where the task captured
7243 // variables must be copied before the task is run (TaskData)
7244 CallInst *TaskData = nullptr;
7245
7246 SmallVector<llvm::Value *> TaskAllocArgs = {
7247 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7248 /*flags=*/Flags,
7249 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7250 /*task_func=*/ProxyFn};
7251
7252 if (HasNoWait)
7253 TaskAllocArgs.push_back(DeviceID);
7254
7255 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7256
7257 if (HasShareds) {
7258 Value *Shareds = StaleCI->getArgOperand(1);
7259 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7260 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7261 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7262 SharedsSize);
7263 }
7264
7265 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7266
7267 // ---------------------------------------------------------------
7268 // V5.2 13.8 target construct
7269 // If the nowait clause is present, execution of the target task
7270 // may be deferred. If the nowait clause is not present, the target task is
7271 // an included task.
7272 // ---------------------------------------------------------------
7273 // The above means that the lack of a nowait on the target construct
7274 // translates to '#pragma omp task if(0)'
7275 if (!HasNoWait) {
7276 if (DepArray) {
7277 Function *TaskWaitFn =
7278 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7279 Builder.CreateCall(
7280 TaskWaitFn,
7281 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7282 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7283 /*dep_list=*/DepArray,
7284 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7285 /*noalias_dep_list=*/
7286 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7287 }
7288 // Included task.
7289 Function *TaskBeginFn =
7290 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7291 Function *TaskCompleteFn =
7292 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7293 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7294 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7295 CI->setDebugLoc(StaleCI->getDebugLoc());
7296 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7297 } else if (DepArray) {
7298 // HasNoWait - meaning the task may be deferred. Call
7299 // __kmpc_omp_task_with_deps if there are dependencies,
7300 // else call __kmpc_omp_task
7301 Function *TaskFn =
7302 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7303 Builder.CreateCall(
7304 TaskFn,
7305 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7306 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7307 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7308 } else {
7309 // Emit the @__kmpc_omp_task runtime call to spawn the task
7310 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7311 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7312 }
7313
7314 StaleCI->eraseFromParent();
7315 for (Instruction *I : llvm::reverse(ToBeDeleted))
7316 I->eraseFromParent();
7317 };
7318 addOutlineInfo(std::move(OI));
7319
7320 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7321 << *(Builder.GetInsertBlock()) << "\n");
7322 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7323 << *(Builder.GetInsertBlock()->getParent()->getParent())
7324 << "\n");
7325 return Builder.saveIP();
7326}
7327
7328void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7329 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7330 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7331 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7332 function_ref<Value *(unsigned int)> CustomMapperCB) {
7333 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7334 DeviceAddrCB, CustomMapperCB);
7335 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7336}
7337
7338static void
7339emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7340 OpenMPIRBuilder::InsertPointTy AllocaIP,
7341 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7342 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
7343 Function *OutlinedFn, Constant *OutlinedFnID,
7344 SmallVectorImpl<Value *> &Args,
7345 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7346 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {},
7347 bool HasNoWait = false) {
7348 // Generate a function call to the host fallback implementation of the target
7349 // region. This is called by the host when no offload entry was generated for
7350 // the target region and when the offloading call fails at runtime.
7351 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7352 -> OpenMPIRBuilder::InsertPointTy {
7353 Builder.restoreIP(IP);
7354 Builder.CreateCall(OutlinedFn, Args);
7355 return Builder.saveIP();
7356 };
7357
7358 bool HasDependencies = Dependencies.size() > 0;
7359 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7360
7361 OpenMPIRBuilder::TargetKernelArgs KArgs;
7362
7363 auto TaskBodyCB =
7364 [&](Value *DeviceID, Value *RTLoc,
7365 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7366 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7367 // produce any.
7368 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7369 // emitKernelLaunch makes the necessary runtime call to offload the
7370 // kernel. We then outline all that code into a separate function
7371 // ('kernel_launch_function' in the pseudo code above). This function is
7372 // then called by the target task proxy function (see
7373 // '@.omp_target_task_proxy_func' in the pseudo code above)
7374 // "@.omp_target_task_proxy_func' is generated by
7375 // emitTargetTaskProxyFunction.
7376 if (OutlinedFnID)
7377 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7378 EmitTargetCallFallbackCB, KArgs,
7379 DeviceID, RTLoc, TargetTaskAllocaIP);
7380 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7381 // In this case, we execute the host implementation directly.
7382 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7383 }());
7384
7385 OMPBuilder.Builder.restoreIP(AfterIP);
7386 return Error::success();
7387 };
7388
7389 // If we don't have an ID for the target region, it means an offload entry
7390 // wasn't created. In this case we just run the host fallback directly.
7391 if (!OutlinedFnID) {
7392 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7393 // produce any.
7394 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7395 if (RequiresOuterTargetTask) {
7396 // Arguments that are intended to be directly forwarded to an
7397 // emitKernelLaunch call are passed as nullptr, since
7398 // OutlinedFnID=nullptr results in that call not being done.
7399 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7400 /*RTLoc=*/nullptr, AllocaIP,
7401 Dependencies, HasNoWait);
7402 }
7403 return EmitTargetCallFallbackCB(Builder.saveIP());
7404 }());
7405
7406 Builder.restoreIP(AfterIP);
7407 return;
7408 }
7409
7411 /*RequiresDevicePointerInfo=*/false,
7412 /*SeparateBeginEndCalls=*/true);
7413
7414 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7415 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7416 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7417 RTArgs, MapInfo,
7418 /*IsNonContiguous=*/true,
7419 /*ForEndCall=*/false);
7420
7421 SmallVector<Value *, 3> NumTeamsC;
7422 for (auto [DefaultVal, RuntimeVal] :
7423 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
7424 NumTeamsC.push_back(RuntimeVal ? RuntimeVal : Builder.getInt32(DefaultVal));
7425
7426 // Calculate the number of threads: 0 if no clauses are specified, otherwise
7427 // the minimum of the optional THREAD_LIMIT and NUM_THREADS clauses.
7428 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
7429 if (Clause)
7430 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
7431 /*isSigned=*/false);
7432 return Clause;
7433 };
7434 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
7435 if (Clause)
7436 Result = Result
7437 ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
7438 Result, Clause)
7439 : Clause;
7440 };
7441
7442 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
7443 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
7444 SmallVector<Value *, 3> NumThreadsC;
7445 Value *MaxThreadsClause = RuntimeAttrs.TeamsThreadLimit.size() == 1
7446 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
7447 : nullptr;
7448
7449 for (auto [TeamsVal, TargetVal] : zip_equal(RuntimeAttrs.TeamsThreadLimit,
7450 RuntimeAttrs.TargetThreadLimit)) {
7451 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
7452 Value *NumThreads = InitMaxThreadsClause(TargetVal);
7453
7454 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
7455 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
7456
7457 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
7458 }
7459
7460 unsigned NumTargetItems = Info.NumberOfPtrs;
7461 // TODO: Use correct device ID
7462 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7463 uint32_t SrcLocStrSize;
7464 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7465 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7466 llvm::omp::IdentFlag(0), 0);
7467
7468 Value *TripCount = RuntimeAttrs.LoopTripCount
7469 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
7470 Builder.getInt64Ty(),
7471 /*isSigned=*/false)
7472 : Builder.getInt64(0);
7473
7474 // TODO: Use correct DynCGGroupMem
7475 Value *DynCGGroupMem = Builder.getInt32(0);
7476
7477 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
7478 NumTeamsC, NumThreadsC,
7479 DynCGGroupMem, HasNoWait);
7480
7481 // Assume no error was returned because TaskBodyCB and
7482 // EmitTargetCallFallbackCB don't produce any.
7483 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7484 // The presence of certain clauses on the target directive requires the
7485 // explicit generation of the target task.
7486 if (RequiresOuterTargetTask)
7487 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7488 Dependencies, HasNoWait);
7489
7490 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7491 EmitTargetCallFallbackCB, KArgs,
7492 DeviceID, RTLoc, AllocaIP);
7493 }());
7494
7495 Builder.restoreIP(AfterIP);
7496}
7497
7498OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
7499 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7500 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7501 const TargetKernelDefaultAttrs &DefaultAttrs,
7502 const TargetKernelRuntimeAttrs &RuntimeAttrs,
7503 SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7504 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7505 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7506 SmallVector<DependData> Dependencies, bool HasNowait) {
7507
7508 if (!updateToLocation(Loc))
7509 return InsertPointTy();
7510
7511 Builder.restoreIP(CodeGenIP);
7512
7513 Function *OutlinedFn;
7514 Constant *OutlinedFnID = nullptr;
7515 // The target region is outlined into its own function. The LLVM IR for
7516 // the target region itself is generated using the callbacks CBFunc
7517 // and ArgAccessorFuncCB.
7518 if (Error Err = emitTargetOutlinedFunction(
7519 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
7520 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB))
7521 return Err;
7522
7523 // If we are not on the target device, then we need to generate code
7524 // to make a remote call (offload) to the previously outlined function
7525 // that represents the target region. Do that now.
7526 if (!Config.isTargetDevice())
7527 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs,
7528 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
7529 HasNowait);
7530 return Builder.saveIP();
7531}
7532
7533std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7534 StringRef FirstSeparator,
7535 StringRef Separator) {
7536 SmallString<128> Buffer;
7537 llvm::raw_svector_ostream OS(Buffer);
7538 StringRef Sep = FirstSeparator;
7539 for (StringRef Part : Parts) {
7540 OS << Sep << Part;
7541 Sep = Separator;
7542 }
7543 return OS.str().str();
7544}
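// Illustrative example: getNameWithSeparators({"x", "y"}, ".", "$") yields
// ".x$y" -- the first separator prefixes the name and the second joins the
// remaining parts.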
7545
7546std::string
7547OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7548 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7549 Config.separator());
7550}
7551
7552GlobalVariable *
7553OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7554 unsigned AddressSpace) {
7555 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7556 if (Elem.second) {
7557 assert(Elem.second->getValueType() == Ty &&
7558 "OMP internal variable has different type than requested");
7559 } else {
7560 // TODO: investigate the appropriate linkage type used for the global
7561 // variable for possibly changing that to internal or private, or maybe
7562 // create different versions of the function for different OMP internal
7563 // variables.
7564 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7565 ? GlobalValue::InternalLinkage
7566 : GlobalValue::CommonLinkage;
7567 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7568 Constant::getNullValue(Ty), Elem.first(),
7569 /*InsertBefore=*/nullptr,
7570 GlobalValue::NotThreadLocal, AddressSpace);
7571 const DataLayout &DL = M.getDataLayout();
7572 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7573 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7574 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7575 Elem.second = GV;
7576 }
7577
7578 return Elem.second;
7579}
7580
7581Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7582 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7583 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7584 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7585}
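// Illustrative example: for a critical directive named "mylock", the lock
// variable created above is named ".gomp_critical_user_mylock.var", the
// conventional name the OpenMP runtime keys critical sections on.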
7586
7587Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7588 LLVMContext &Ctx = M.getContext();
7589 Value *Null =
7590 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7591 Value *SizeGep =
7592 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7593 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7594 return SizePtrToInt;
7595}
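// Illustrative note: this is the classic "sizeof via GEP" idiom -- indexing
// one element past a null pointer yields the element size as an address,
// which ptrtoint then materializes as an integer:
//
//   %size.gep = getelementptr %T, ptr null, i32 1
//   %size = ptrtoint ptr %size.gep to i64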
7596
7597GlobalVariable *
7598OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7599 std::string VarName) {
7600 llvm::Constant *MaptypesArrayInit =
7601 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7602 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7603 M, MaptypesArrayInit->getType(),
7604 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7605 VarName);
7606 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7607 return MaptypesArrayGlobal;
7608}
7609
7610void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7611 InsertPointTy AllocaIP,
7612 unsigned NumOperands,
7613 struct MapperAllocas &MapperAllocas) {
7614 if (!updateToLocation(Loc))
7615 return;
7616
7617 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7618 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7619 Builder.restoreIP(AllocaIP);
7620 AllocaInst *ArgsBase = Builder.CreateAlloca(
7621 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7622 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7623 ".offload_ptrs");
7624 AllocaInst *ArgSizes = Builder.CreateAlloca(
7625 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7626 Builder.restoreIP(Loc.IP);
7627 MapperAllocas.ArgsBase = ArgsBase;
7628 MapperAllocas.Args = Args;
7629 MapperAllocas.ArgSizes = ArgSizes;
7630}
7631
7632void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7633 Function *MapperFunc, Value *SrcLocInfo,
7634 Value *MaptypesArg, Value *MapnamesArg,
7635 struct MapperAllocas &MapperAllocas,
7636 int64_t DeviceID, unsigned NumOperands) {
7637 if (!updateToLocation(Loc))
7638 return;
7639
7640 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7641 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7642 Value *ArgsBaseGEP =
7643 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7644 {Builder.getInt32(0), Builder.getInt32(0)});
7645 Value *ArgsGEP =
7646 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7647 {Builder.getInt32(0), Builder.getInt32(0)});
7648 Value *ArgSizesGEP =
7649 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7650 {Builder.getInt32(0), Builder.getInt32(0)});
7651 Value *NullPtr =
7652 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7653 Builder.CreateCall(MapperFunc,
7654 {SrcLocInfo, Builder.getInt64(DeviceID),
7655 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7656 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7657}
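// Hedged sketch of the emitted call, assuming MapperFunc is one of the
// __tgt_target_data_{begin,end,update}_mapper entry points:
//   MapperFunc(SrcLocInfo, DeviceID, NumOperands, .offload_baseptrs,
//              .offload_ptrs, .offload_sizes, MaptypesArg, MapnamesArg,
//              /*mappers=*/null);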
7658
7659void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7660                                                   TargetDataRTArgs &RTArgs,
7661 TargetDataInfo &Info,
7662 bool ForEndCall) {
7663 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7664 "expected region end call to runtime only when end call is separate");
7665 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7666 auto VoidPtrTy = UnqualPtrTy;
7667 auto VoidPtrPtrTy = UnqualPtrTy;
7668 auto Int64Ty = Type::getInt64Ty(M.getContext());
7669 auto Int64PtrTy = UnqualPtrTy;
7670
7671 if (!Info.NumberOfPtrs) {
7672 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7673 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7674 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7675 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7676 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7677 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7678 return;
7679 }
7680
7681 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7682 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7683 Info.RTArgs.BasePointersArray,
7684 /*Idx0=*/0, /*Idx1=*/0);
7685 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7686 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7687 /*Idx0=*/0,
7688 /*Idx1=*/0);
7689 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7690 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7691 /*Idx0=*/0, /*Idx1=*/0);
7692 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7693 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7694 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7695 : Info.RTArgs.MapTypesArray,
7696 /*Idx0=*/0,
7697 /*Idx1=*/0);
7698
7699 // Only emit the mapper information arrays if debug information is
7700 // requested.
7701 if (!Info.EmitDebug)
7702 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7703 else
7704 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7705 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7706 /*Idx0=*/0,
7707 /*Idx1=*/0);
7708 // If there is no user-defined mapper, set the mapper array to nullptr to
7709 // avoid an unnecessary data privatization.
7710 if (!Info.HasMapper)
7711 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7712 else
7713 RTArgs.MappersArray =
7714 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7715}
7716
7717void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7718                                                  InsertPointTy CodeGenIP,
7719 MapInfosTy &CombinedInfo,
7720 TargetDataInfo &Info) {
7721 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7722 CombinedInfo.NonContigInfo;
7723
7724 // Build an array of struct descriptor_dim and then assign it to
7725 // offload_args.
7726 //
7727 // struct descriptor_dim {
7728 // uint64_t offset;
7729 // uint64_t count;
7730 // uint64_t stride;
7731 // };
7732 Type *Int64Ty = Builder.getInt64Ty();
7733 StructType *DimTy = StructType::create(
7734 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7735 "struct.descriptor_dim");
7736
7737 enum { OffsetFD = 0, CountFD, StrideFD };
7738 // We need two index variables here since the size of "Dims" is the same as
7739 // the size of Components; however, the size of offset, count, and stride is
7740 // equal to the size of the base declaration that is non-contiguous.
7741 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7742 // Skip emitting IR if the dimension size is 1, since it cannot be
7743 // non-contiguous.
7744 if (NonContigInfo.Dims[I] == 1)
7745 continue;
7746 Builder.restoreIP(AllocaIP);
7747 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7748 AllocaInst *DimsAddr =
7749 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7750 Builder.restoreIP(CodeGenIP);
7751 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7752 unsigned RevIdx = EE - II - 1;
7753 Value *DimsLVal = Builder.CreateInBoundsGEP(
7754 DimsAddr->getAllocatedType(), DimsAddr,
7755 {Builder.getInt64(0), Builder.getInt64(II)});
7756 // Offset
7757 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7758 Builder.CreateAlignedStore(
7759 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7760 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7761 // Count
7762 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7763 Builder.CreateAlignedStore(
7764 NonContigInfo.Counts[L][RevIdx], CountLVal,
7765 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7766 // Stride
7767 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7768 Builder.CreateAlignedStore(
7769 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7770 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7771 }
7772 // args[I] = &dims
7773 Builder.restoreIP(CodeGenIP);
7774 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7775 DimsAddr, Builder.getPtrTy());
7776 Value *P = Builder.CreateConstInBoundsGEP2_32(
7777 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7778 Info.RTArgs.PointersArray, 0, I);
7779 Builder.CreateAlignedStore(
7780 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7781 ++L;
7782 }
7783}
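// Hedged example (illustrative values): for a non-contiguous section such as
// a[0:4:2] of `int a[8]`, one descriptor_dim would hold roughly
// {offset = 0, count = 4, stride = 2 * sizeof(int)}, and the corresponding
// args[I] slot then points at this dims array rather than at the data itself.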
7784
7785void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7786 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7787 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7788 BasicBlock *ExitBB, bool IsInit) {
7789 StringRef Prefix = IsInit ? ".init" : ".del";
7790
7791 // Evaluate if this is an array section.
7792 BasicBlock *BodyBB = BasicBlock::Create(
7793 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7794 Value *IsArray =
7795 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7796 Value *DeleteBit = Builder.CreateAnd(
7797 MapType,
7798 Builder.getInt64(
7799 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7800 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7801 Value *DeleteCond;
7802 Value *Cond;
7803 if (IsInit) {
7804 // base != begin?
7805 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7806 // IsPtrAndObj?
7807 Value *PtrAndObjBit = Builder.CreateAnd(
7808 MapType,
7809 Builder.getInt64(
7810 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7811 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7812 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7813 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7814 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7815 DeleteCond = Builder.CreateIsNull(
7816 DeleteBit,
7817 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7818 } else {
7819 Cond = IsArray;
7820 DeleteCond = Builder.CreateIsNotNull(
7821 DeleteBit,
7822 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7823 }
7824 Cond = Builder.CreateAnd(Cond, DeleteCond);
7825 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7826
7827 emitBlock(BodyBB, MapperFn);
7828 // Get the array size by multiplying element size and element number (i.e., \p
7829 // Size).
7830 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7831 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7832 // memory allocation/deletion purpose only.
7833 Value *MapTypeArg = Builder.CreateAnd(
7834 MapType,
7835 Builder.getInt64(
7836 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7837 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7838 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7839 MapTypeArg = Builder.CreateOr(
7840 MapTypeArg,
7841 Builder.getInt64(
7842 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7843 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7844
7845 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7846 // data structure.
7847 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7848 ArraySize, MapTypeArg, MapName};
7849 Builder.CreateCall(
7850 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7851 OffloadingArgs);
7852}
7853
7854Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
7855    function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7856 llvm::Value *BeginArg)>
7857 GenMapInfoCB,
7858 Type *ElemTy, StringRef FuncName,
7859 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
7860 SmallVector<Type *> Params;
7861 Params.emplace_back(Builder.getPtrTy());
7862 Params.emplace_back(Builder.getPtrTy());
7863 Params.emplace_back(Builder.getPtrTy());
7864 Params.emplace_back(Int64);
7865 Params.emplace_back(Int64);
7866 Params.emplace_back(Builder.getPtrTy());
7867
7868 auto *FnTy =
7869 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7870
7871 SmallString<64> TyStr;
7872 raw_svector_ostream Out(TyStr);
7873 Function *MapperFn =
7874 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
7875 MapperFn->addFnAttr(Attribute::NoInline);
7876 MapperFn->addFnAttr(Attribute::NoUnwind);
7877 MapperFn->addParamAttr(0, Attribute::NoUndef);
7878 MapperFn->addParamAttr(1, Attribute::NoUndef);
7879 MapperFn->addParamAttr(2, Attribute::NoUndef);
7880 MapperFn->addParamAttr(3, Attribute::NoUndef);
7881 MapperFn->addParamAttr(4, Attribute::NoUndef);
7882 MapperFn->addParamAttr(5, Attribute::NoUndef);
7883
7884 // Start the mapper function code generation.
7885 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7886 auto SavedIP = Builder.saveIP();
7887 Builder.SetInsertPoint(EntryBB);
7888
7889 Value *MapperHandle = MapperFn->getArg(0);
7890 Value *BaseIn = MapperFn->getArg(1);
7891 Value *BeginIn = MapperFn->getArg(2);
7892 Value *Size = MapperFn->getArg(3);
7893 Value *MapType = MapperFn->getArg(4);
7894 Value *MapName = MapperFn->getArg(5);
7895
7896 // Compute the starting and end addresses of array elements.
7897 // Prepare common arguments for array initialization and deletion.
7898 // Convert the size in bytes into the number of array elements.
7899 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7900 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
7901 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7902 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7903
7904 // Emit array initialization if this is an array section and \p MapType
7905 // indicates that memory allocation is required.
7906 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7907 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7908 MapType, MapName, ElementSize, HeadBB,
7909 /*IsInit=*/true);
7910
7911 // Emit a for loop to iterate through \p Size elements and map all of them.
7912
7913 // Emit the loop header block.
7914 emitBlock(HeadBB, MapperFn);
7915 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7916 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
7917 // Evaluate whether the initial condition is satisfied.
7918 Value *IsEmpty =
7919 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
7920 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
7921
7922 // Emit the loop body block.
7923 emitBlock(BodyBB, MapperFn);
7924 BasicBlock *LastBB = BodyBB;
7925 PHINode *PtrPHI =
7926 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
7927 PtrPHI->addIncoming(PtrBegin, HeadBB);
7928
7929 // Get map clause information. Fill up the arrays with all mapped variables.
7930 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
7931
7932 // Call the runtime API __tgt_mapper_num_components to get the number of
7933 // pre-existing components.
7934 Value *OffloadingArgs[] = {MapperHandle};
7935 Value *PreviousSize = Builder.CreateCall(
7936 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
7937 OffloadingArgs);
7938 Value *ShiftedPreviousSize =
7939 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
7940
7941 // Fill up the runtime mapper handle for all components.
7942 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
7943 Value *CurBaseArg =
7944 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
7945 Value *CurBeginArg =
7946 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
7947 Value *CurSizeArg = Info.Sizes[I];
7948 Value *CurNameArg = Info.Names.size()
7949 ? Info.Names[I]
7950 : Constant::getNullValue(Builder.getPtrTy());
7951
7952 // Extract the MEMBER_OF field from the map type.
7953 Value *OriMapType = Builder.getInt64(
7954 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7955 Info.Types[I]));
7956 Value *MemberMapType =
7957 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
7958
7959 // Combine the map type inherited from user-defined mapper with that
7960 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
7961 // bits of the \a MapType, which is the input argument of the mapper
7962 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
7963 // bits of MemberMapType.
7964 // [OpenMP 5.0], 1.2.6. map-type decay.
7965 // | alloc | to | from | tofrom | release | delete
7966 // ----------------------------------------------------------
7967 // alloc | alloc | alloc | alloc | alloc | release | delete
7968 // to | alloc | to | alloc | to | release | delete
7969 // from | alloc | alloc | from | from | release | delete
7970 // tofrom | alloc | to | from | tofrom | release | delete
7971 Value *LeftToFrom = Builder.CreateAnd(
7972 MapType,
7973 Builder.getInt64(
7974 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7975 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7976 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7977 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
7978 BasicBlock *AllocElseBB =
7979 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
7980 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
7981 BasicBlock *ToElseBB =
7982 BasicBlock::Create(M.getContext(), "omp.type.to.else");
7983 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
7984 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
7985 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
7986 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
7987 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
7988 emitBlock(AllocBB, MapperFn);
7989 Value *AllocMapType = Builder.CreateAnd(
7990 MemberMapType,
7991 Builder.getInt64(
7992 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7993 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7994 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7995 Builder.CreateBr(EndBB);
7996 emitBlock(AllocElseBB, MapperFn);
7997 Value *IsTo = Builder.CreateICmpEQ(
7998 LeftToFrom,
7999 Builder.getInt64(
8000 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8001 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8002 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8003 // In case of to, clear OMP_MAP_FROM.
8004 emitBlock(ToBB, MapperFn);
8005 Value *ToMapType = Builder.CreateAnd(
8006 MemberMapType,
8007 Builder.getInt64(
8008 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8009 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8010 Builder.CreateBr(EndBB);
8011 emitBlock(ToElseBB, MapperFn);
8012 Value *IsFrom = Builder.CreateICmpEQ(
8013 LeftToFrom,
8014 Builder.getInt64(
8015 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8016 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8017 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8018 // In case of from, clear OMP_MAP_TO.
8019 emitBlock(FromBB, MapperFn);
8020 Value *FromMapType = Builder.CreateAnd(
8021 MemberMapType,
8022 Builder.getInt64(
8023 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8024 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8025 // In case of tofrom, do nothing.
8026 emitBlock(EndBB, MapperFn);
8027 LastBB = EndBB;
8028 PHINode *CurMapType =
8029 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8030 CurMapType->addIncoming(AllocMapType, AllocBB);
8031 CurMapType->addIncoming(ToMapType, ToBB);
8032 CurMapType->addIncoming(FromMapType, FromBB);
8033 CurMapType->addIncoming(MemberMapType, ToElseBB);
8034
8035 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8036 CurSizeArg, CurMapType, CurNameArg};
8037 Function *ChildMapperFn = nullptr;
8038 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
8039 // Call the corresponding mapper function.
8040 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8041 } else {
8042 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8043 // data structure.
8044 Builder.CreateCall(
8045 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8046 OffloadingArgs);
8047 }
8048 }
8049
8050 // Update the pointer to point to the next element that needs to be mapped,
8051 // and check whether we have mapped all elements.
8052 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8053 "omp.arraymap.next");
8054 PtrPHI->addIncoming(PtrNext, LastBB);
8055 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8056 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8057 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8058
8059 emitBlock(ExitBB, MapperFn);
8060 // Emit array deletion if this is an array section and \p MapType indicates
8061 // that deletion is required.
8062 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8063 MapType, MapName, ElementSize, DoneBB,
8064 /*IsInit=*/false);
8065
8066 // Emit the function exit block.
8067 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8068
8070 Builder.restoreIP(SavedIP);
8071 return MapperFn;
8072}
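// Hedged pseudocode of the mapper this routine generates (structure only,
// not emitted verbatim):
//   void .omp_mapper(ptr handle, ptr base, ptr begin, i64 size, i64 type,
//                    ptr name) {
//     maybe push an allocation component;             // .init path above
//     for (cur = begin; cur != end; ++cur)
//       push or recurse per member with decayed type; // map-type decay table
//     maybe push a deletion component;                // .del path above
//   }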
8073
8074void OpenMPIRBuilder::emitOffloadingArrays(
8075    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8076 TargetDataInfo &Info, bool IsNonContiguous,
8077 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8078 function_ref<Value *(unsigned int)> CustomMapperCB) {
8079
8080 // Reset the array information.
8081 Info.clearArrayInfo();
8082 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8083
8084 if (Info.NumberOfPtrs == 0)
8085 return;
8086
8087 Builder.restoreIP(AllocaIP);
8088 // Detect if we have any capture size requiring runtime evaluation of the
8089 // size so that a constant array could be eventually used.
8090 ArrayType *PointerArrayType =
8091 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8092
8093 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8094 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8095
8096 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8097 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8098 AllocaInst *MappersArray = Builder.CreateAlloca(
8099 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8100 Info.RTArgs.MappersArray = MappersArray;
8101
8102 // If we don't have any VLA types or other types that require runtime
8103 // evaluation, we can use a constant array for the map sizes, otherwise we
8104 // need to fill up the arrays as we do for the pointers.
8105 Type *Int64Ty = Builder.getInt64Ty();
8106 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8107 ConstantInt::get(Int64Ty, 0));
8108 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8109 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8110 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8111 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8112 if (IsNonContiguous &&
8113 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8114 CombinedInfo.Types[I] &
8115 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8116 ConstSizes[I] =
8117 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8118 else
8119 ConstSizes[I] = CI;
8120 continue;
8121 }
8122 }
8123 RuntimeSizes.set(I);
8124 }
8125
8126 if (RuntimeSizes.all()) {
8127 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8128 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8129 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8130 Builder.restoreIP(CodeGenIP);
8131 } else {
8132 auto *SizesArrayInit = ConstantArray::get(
8133 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8134 std::string Name = createPlatformSpecificName({"offload_sizes"});
8135 auto *SizesArrayGbl =
8136 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8137 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8138 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8139
8140 if (!RuntimeSizes.any()) {
8141 Info.RTArgs.SizesArray = SizesArrayGbl;
8142 } else {
8143 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8144 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8145 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8146 AllocaInst *Buffer = Builder.CreateAlloca(
8147 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8148 Buffer->setAlignment(OffloadSizeAlign);
8149 Builder.restoreIP(CodeGenIP);
8150 Builder.CreateMemCpy(
8151 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8152 SizesArrayGbl, OffloadSizeAlign,
8153 Builder.getIntN(
8154 IndexSize,
8155 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8156
8157 Info.RTArgs.SizesArray = Buffer;
8158 }
8159 Builder.restoreIP(CodeGenIP);
8160 }
8161
8162 // The map types are always constant so we don't need to generate code to
8163 // fill arrays. Instead, we create an array constant.
8164 SmallVector<uint64_t, 4> Mapping;
8165 for (auto mapFlag : CombinedInfo.Types)
8166 Mapping.push_back(
8167 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8168 mapFlag));
8169 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8170 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8171 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8172
8173 // The information types are only built if provided.
8174 if (!CombinedInfo.Names.empty()) {
8175 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8176 auto *MapNamesArrayGbl =
8177 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8178 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8179 Info.EmitDebug = true;
8180 } else {
8181 Info.RTArgs.MapNamesArray =
8182 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
8183 Info.EmitDebug = false;
8184 }
8185
8186 // If there's a present map type modifier, it must not be applied to the end
8187 // of a region, so generate a separate map type array in that case.
8188 if (Info.separateBeginEndCalls()) {
8189 bool EndMapTypesDiffer = false;
8190 for (uint64_t &Type : Mapping) {
8191 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8192 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8193 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8194 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8195 EndMapTypesDiffer = true;
8196 }
8197 }
8198 if (EndMapTypesDiffer) {
8199 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8200 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8201 }
8202 }
8203
8204 PointerType *PtrTy = Builder.getPtrTy();
8205 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8206 Value *BPVal = CombinedInfo.BasePointers[I];
8207 Value *BP = Builder.CreateConstInBoundsGEP2_32(
8208 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8209 0, I);
8210 Builder.CreateAlignedStore(BPVal, BP,
8211 M.getDataLayout().getPrefTypeAlign(PtrTy));
8212
8213 if (Info.requiresDevicePointerInfo()) {
8214 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8215 CodeGenIP = Builder.saveIP();
8216 Builder.restoreIP(AllocaIP);
8217 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8218 Builder.restoreIP(CodeGenIP);
8219 if (DeviceAddrCB)
8220 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8221 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8222 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8223 if (DeviceAddrCB)
8224 DeviceAddrCB(I, BP);
8225 }
8226 }
8227
8228 Value *PVal = CombinedInfo.Pointers[I];
8229 Value *P = Builder.CreateConstInBoundsGEP2_32(
8230 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8231 I);
8232 // TODO: Check alignment correct.
8233 Builder.CreateAlignedStore(PVal, P,
8234 M.getDataLayout().getPrefTypeAlign(PtrTy));
8235
8236 if (RuntimeSizes.test(I)) {
8237 Value *S = Builder.CreateConstInBoundsGEP2_32(
8238 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8239 /*Idx0=*/0,
8240 /*Idx1=*/I);
8241 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
8242 Int64Ty,
8243 /*isSigned=*/true),
8244 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8245 }
8246 // Fill up the mapper array.
8247 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8248 Value *MFunc = ConstantPointerNull::get(PtrTy);
8249 if (CustomMapperCB)
8250 if (Value *CustomMFunc = CustomMapperCB(I))
8251 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8252 Value *MAddr = Builder.CreateInBoundsGEP(
8253 MappersArray->getAllocatedType(), MappersArray,
8254 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8255 Builder.CreateAlignedStore(
8256 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8257 }
8258
8259 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8260 Info.NumberOfPtrs == 0)
8261 return;
8262 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8263}
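// Hedged illustration of the arrays materialized above for two map clauses
// (values illustrative; the names are the alloca/global labels used here):
//   .offload_baseptrs = { &s,      &s.buf }  ; ptr x 2, alloca
//   .offload_ptrs     = { &s,      s.buf  }  ; ptr x 2, alloca
//   .offload_sizes    = { 16,      N      }  ; i64 x 2, global if all constant
//   .offload_maptypes = { TO|FROM, TO     }  ; always a constant global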
8264
8265void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
8266 BasicBlock *CurBB = Builder.GetInsertBlock();
8267
8268 if (!CurBB || CurBB->getTerminator()) {
8269 // If there is no insert point or the previous block is already
8270 // terminated, don't touch it.
8271 } else {
8272 // Otherwise, create a fall-through branch.
8273 Builder.CreateBr(Target);
8274 }
8275
8276 Builder.ClearInsertionPoint();
8277}
8278
8279void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
8280                                bool IsFinished) {
8281 BasicBlock *CurBB = Builder.GetInsertBlock();
8282
8283 // Fall out of the current block (if necessary).
8284 emitBranch(BB);
8285
8286 if (IsFinished && BB->use_empty()) {
8287 BB->eraseFromParent();
8288 return;
8289 }
8290
8291 // Place the block after the current block, if possible, or else at
8292 // the end of the function.
8293 if (CurBB && CurBB->getParent())
8294 CurFn->insert(std::next(CurBB->getIterator()), BB);
8295 else
8296 CurFn->insert(CurFn->end(), BB);
8297 Builder.SetInsertPoint(BB);
8298}
8299
8300Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
8301                                    BodyGenCallbackTy ElseGen,
8302 InsertPointTy AllocaIP) {
8303 // If the condition constant folds and can be elided, try to avoid emitting
8304 // the condition and the dead arm of the if/else.
8305 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8306 auto CondConstant = CI->getSExtValue();
8307 if (CondConstant)
8308 return ThenGen(AllocaIP, Builder.saveIP());
8309
8310 return ElseGen(AllocaIP, Builder.saveIP());
8311 }
8312
8313 Function *CurFn = Builder.GetInsertBlock()->getParent();
8314
8315 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8316 // emit the conditional branch.
8317 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8318 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8319 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8320 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8321 // Emit the 'then' code.
8322 emitBlock(ThenBlock, CurFn);
8323 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8324 return Err;
8325 emitBranch(ContBlock);
8326 // Emit the 'else' code if present.
8327 // There is no need to emit line number for unconditional branch.
8328 emitBlock(ElseBlock, CurFn);
8329 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8330 return Err;
8331 // There is no need to emit line number for unconditional branch.
8332 emitBranch(ContBlock);
8333 // Emit the continuation block for code after the if.
8334 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8335 return Error::success();
8336}
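// Hedged usage sketch (hypothetical callbacks): when Cond is a constant the
// dead arm is never emitted, e.g.
//   Error E = emitIfClause(Builder.getInt1(true), ThenGen, ElseGen, AllocaIP);
// runs ThenGen only, while a non-constant Cond produces the
// omp_if.then/omp_if.else/omp_if.end diamond emitted above.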
8337
8338bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8339 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8342 "Unexpected Atomic Ordering.");
8343
8344 bool Flush = false;
8346
8347 switch (AK) {
8348 case Read:
8351 FlushAO = AtomicOrdering::Acquire;
8352 Flush = true;
8353 }
8354 break;
8355 case Write:
8356 case Compare:
8357 case Update:
8360 FlushAO = AtomicOrdering::Release;
8361 Flush = true;
8362 }
8363 break;
8364 case Capture:
8365 switch (AO) {
8367 FlushAO = AtomicOrdering::Acquire;
8368 Flush = true;
8369 break;
8371 FlushAO = AtomicOrdering::Release;
8372 Flush = true;
8373 break;
8377 Flush = true;
8378 break;
8379 default:
8380 // do nothing - leave silently.
8381 break;
8382 }
8383 }
8384
8385 if (Flush) {
8386 // The flush runtime call does not yet take a memory ordering, so this only
8387 // resolves which atomic ordering the flush would need and then issues the
8388 // plain flush call.
8389 // TODO: pass `FlushAO` after memory ordering support is added
8390 (void)FlushAO;
8391 emitFlush(Loc);
8392 }
8393
8394 // For AO == AtomicOrdering::Monotonic and all other case combinations, do
8395 // nothing.
8396 return Flush;
8397}
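// Hedged example: `#pragma omp atomic read seq_cst` reaches here with
// AK == Read and AO == SequentiallyConsistent, so an acquire-flavored flush is
// emitted after the load; with the default relaxed (monotonic) ordering no
// flush is emitted at all.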
8398
8399OpenMPIRBuilder::InsertPointTy
8400OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
8401                                  AtomicOpValue &X, AtomicOpValue &V,
8402                                  AtomicOrdering AO) {
8403 if (!updateToLocation(Loc))
8404 return Loc.IP;
8405
8406 assert(X.Var->getType()->isPointerTy() &&
8407 "OMP Atomic expects a pointer to target memory");
8408 Type *XElemTy = X.ElemTy;
8409 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8410 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8411 "OMP atomic read expected a scalar type");
8412
8413 Value *XRead = nullptr;
8414
8415 if (XElemTy->isIntegerTy()) {
8416 LoadInst *XLD =
8417 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8418 XLD->setAtomic(AO);
8419 XRead = cast<Value>(XLD);
8420 } else if (XElemTy->isStructTy()) {
8421 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8422 // target does not support `atomicrmw` of the size of the struct
8423 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8424 OldVal->setAtomic(AO);
8425 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8426 unsigned LoadSize =
8427 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8428 OpenMPIRBuilder::AtomicInfo atomicInfo(
8429 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8430 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8431 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8432 XRead = AtomicLoadRes.first;
8433 OldVal->eraseFromParent();
8434 } else {
8435 // We need to perform atomic op as integer
8436 IntegerType *IntCastTy =
8437 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8438 LoadInst *XLoad =
8439 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8440 XLoad->setAtomic(AO);
8441 if (XElemTy->isFloatingPointTy()) {
8442 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8443 } else {
8444 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8445 }
8446 }
8447 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8448 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8449 return Builder.saveIP();
8450}
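// Hedged example of the construct lowered above:
//   #pragma omp atomic read
//   v = x;
// becomes an atomic load of x (float and pointer element types are loaded as
// a same-width integer and cast back) followed by a plain store into v.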
8451
8452OpenMPIRBuilder::InsertPointTy
8453OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
8454                                   AtomicOpValue &X, Value *Expr,
8455 AtomicOrdering AO) {
8456 if (!updateToLocation(Loc))
8457 return Loc.IP;
8458
8459 assert(X.Var->getType()->isPointerTy() &&
8460 "OMP Atomic expects a pointer to target memory");
8461 Type *XElemTy = X.ElemTy;
8462 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8463 XElemTy->isPointerTy()) &&
8464 "OMP atomic write expected a scalar type");
8465
8466 if (XElemTy->isIntegerTy()) {
8467 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8468 XSt->setAtomic(AO);
8469 } else {
8470 // We need to bitcast and perform atomic op as integers
8471 IntegerType *IntCastTy =
8472 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8473 Value *ExprCast =
8474 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8475 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8476 XSt->setAtomic(AO);
8477 }
8478
8479 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8480 return Builder.saveIP();
8481}
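// Hedged example of the construct lowered above:
//   #pragma omp atomic write
//   x = expr;
// becomes an atomic store; non-integer payloads are first bitcast to a
// same-width integer so the store itself can carry the atomic ordering.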
8482
8483OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
8484    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8485 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8486 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8487 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8488 if (!updateToLocation(Loc))
8489 return Loc.IP;
8490
8491 LLVM_DEBUG({
8492 Type *XTy = X.Var->getType();
8493 assert(XTy->isPointerTy() &&
8494 "OMP Atomic expects a pointer to target memory");
8495 Type *XElemTy = X.ElemTy;
8496 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8497 XElemTy->isPointerTy()) &&
8498 "OMP atomic update expected a scalar type");
8499 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8500 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8501 "OpenMP atomic does not support LT or GT operations");
8502 });
8503
8504 Expected<std::pair<Value *, Value *>> AtomicResult =
8505 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8506 X.IsVolatile, IsXBinopExpr);
8507 if (!AtomicResult)
8508 return AtomicResult.takeError();
8509 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8510 return Builder.saveIP();
8511}
8512
8513// FIXME: Duplicating AtomicExpand
8514Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8515 AtomicRMWInst::BinOp RMWOp) {
8516 switch (RMWOp) {
8517 case AtomicRMWInst::Add:
8518 return Builder.CreateAdd(Src1, Src2);
8519 case AtomicRMWInst::Sub:
8520 return Builder.CreateSub(Src1, Src2);
8521 case AtomicRMWInst::And:
8522 return Builder.CreateAnd(Src1, Src2);
8523 case AtomicRMWInst::Nand:
8524 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
8525 case AtomicRMWInst::Or:
8526 return Builder.CreateOr(Src1, Src2);
8527 case AtomicRMWInst::Xor:
8528 return Builder.CreateXor(Src1, Src2);
8529 case AtomicRMWInst::Xchg:
8530 case AtomicRMWInst::FAdd:
8531 case AtomicRMWInst::FSub:
8532 case AtomicRMWInst::BAD_BINOP:
8533 case AtomicRMWInst::Max:
8534 case AtomicRMWInst::Min:
8535 case AtomicRMWInst::UMax:
8536 case AtomicRMWInst::UMin:
8537 case AtomicRMWInst::FMax:
8538 case AtomicRMWInst::FMin:
8539 case AtomicRMWInst::UIncWrap:
8540 case AtomicRMWInst::UDecWrap:
8541 case AtomicRMWInst::USubCond:
8542 case AtomicRMWInst::USubSat:
8543 llvm_unreachable("Unsupported atomic update operation");
8544 }
8545 llvm_unreachable("Unsupported atomic update operation");
8546}
8547
8548Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8549 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8550    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8551    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8552 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8553 // or a complex datatype.
8554 bool emitRMWOp = false;
8555 switch (RMWOp) {
8556 case AtomicRMWInst::Add:
8557 case AtomicRMWInst::And:
8558 case AtomicRMWInst::Nand:
8559 case AtomicRMWInst::Or:
8560 case AtomicRMWInst::Xor:
8561 case AtomicRMWInst::Xchg:
8562 emitRMWOp = XElemTy;
8563 break;
8564 case AtomicRMWInst::Sub:
8565 emitRMWOp = (IsXBinopExpr && XElemTy);
8566 break;
8567 default:
8568 emitRMWOp = false;
8569 }
8570 emitRMWOp &= XElemTy->isIntegerTy();
8571
8572 std::pair<Value *, Value *> Res;
8573 if (emitRMWOp) {
8574 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8575 // Not needed except in case of postfix captures. Generated anyway for
8576 // consistency with the else part; it will be removed by any DCE pass.
8577 // AtomicRMWInst::Xchg does not have a corresponding instruction.
8578 if (RMWOp == AtomicRMWInst::Xchg)
8579 Res.second = Res.first;
8580 else
8581 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8582 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8583 XElemTy->isStructTy()) {
8584 LoadInst *OldVal =
8585 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8586 OldVal->setAtomic(AO);
8587 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8588 unsigned LoadSize =
8589 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8590
8591 OpenMPIRBuilder::AtomicInfo atomicInfo(
8592 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8593 OldVal->getAlign(), true /* UseLibcall */, X);
8594 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8595 BasicBlock *CurBB = Builder.GetInsertBlock();
8596 Instruction *CurBBTI = CurBB->getTerminator();
8597 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8598 BasicBlock *ExitBB =
8599 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8600 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8601 X->getName() + ".atomic.cont");
8602 ContBB->getTerminator()->eraseFromParent();
8603 Builder.restoreIP(AllocaIP);
8604 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8605 NewAtomicAddr->setName(X->getName() + "x.new.val");
8606 Builder.SetInsertPoint(ContBB);
8607 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8608 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8609 Value *OldExprVal = PHI;
8610 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8611 if (!CBResult)
8612 return CBResult.takeError();
8613 Value *Upd = *CBResult;
8614 Builder.CreateStore(Upd, NewAtomicAddr);
8615 AtomicOrdering Failure =
8616 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8617 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8618 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8619 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8620 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8621 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8622 OldVal->eraseFromParent();
8623 Res.first = OldExprVal;
8624 Res.second = Upd;
8625
8626 if (UnreachableInst *ExitTI =
8627 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8628 CurBBTI->eraseFromParent();
8629 Builder.SetInsertPoint(ExitBB);
8630 } else {
8631 Builder.SetInsertPoint(ExitTI);
8632 }
8633 } else {
8634 IntegerType *IntCastTy =
8635 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8636 LoadInst *OldVal =
8637 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8638 OldVal->setAtomic(AO);
8639 // CurBB
8640 // | /---\
8641 // ContBB |
8642 // | \---/
8643 // ExitBB
8644 BasicBlock *CurBB = Builder.GetInsertBlock();
8645 Instruction *CurBBTI = CurBB->getTerminator();
8646 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8647 BasicBlock *ExitBB =
8648 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8649 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8650 X->getName() + ".atomic.cont");
8651 ContBB->getTerminator()->eraseFromParent();
8652 Builder.restoreIP(AllocaIP);
8653 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8654 NewAtomicAddr->setName(X->getName() + "x.new.val");
8655 Builder.SetInsertPoint(ContBB);
8656 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8657 PHI->addIncoming(OldVal, CurBB);
8658 bool IsIntTy = XElemTy->isIntegerTy();
8659 Value *OldExprVal = PHI;
8660 if (!IsIntTy) {
8661 if (XElemTy->isFloatingPointTy()) {
8662 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8663 X->getName() + ".atomic.fltCast");
8664 } else {
8665 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8666 X->getName() + ".atomic.ptrCast");
8667 }
8668 }
8669
8670 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8671 if (!CBResult)
8672 return CBResult.takeError();
8673 Value *Upd = *CBResult;
8674 Builder.CreateStore(Upd, NewAtomicAddr);
8675 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8676 AtomicOrdering Failure =
8677 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8678 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
8679 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8680 Result->setVolatile(VolatileX);
8681 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8682 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8683 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8684 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8685
8686 Res.first = OldExprVal;
8687 Res.second = Upd;
8688
8689 // set Insertion point in exit block
8690 if (UnreachableInst *ExitTI =
8691 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8692 CurBBTI->eraseFromParent();
8693 Builder.SetInsertPoint(ExitBB);
8694 } else {
8695 Builder.SetInsertPoint(ExitTI);
8696 }
8697 }
8698
8699 return Res;
8700}
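// Hedged example: for `#pragma omp atomic update` with `x += expr` on an
// integer x, the fast path above is a single `atomicrmw add`; for a float x it
// falls back to the cmpxchg retry loop sketched in the CurBB/ContBB/ExitBB
// diagram, re-running UpdateOp until the compare-exchange succeeds.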
8701
8702OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
8703    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8704    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8705    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8706    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8707 if (!updateToLocation(Loc))
8708 return Loc.IP;
8709
8710 LLVM_DEBUG({
8711 Type *XTy = X.Var->getType();
8712 assert(XTy->isPointerTy() &&
8713 "OMP Atomic expects a pointer to target memory");
8714 Type *XElemTy = X.ElemTy;
8715 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8716 XElemTy->isPointerTy()) &&
8717 "OMP atomic capture expected a scalar type");
8718 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8719 "OpenMP atomic does not support LT or GT operations");
8720 });
8721
8722 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8723 // 'x' is simply atomically rewritten with 'expr'.
8724 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8725 Expected<std::pair<Value *, Value *>> AtomicResult =
8726 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8727 X.IsVolatile, IsXBinopExpr);
8728 if (!AtomicResult)
8729 return AtomicResult.takeError();
8730 Value *CapturedVal =
8731 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8732 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8733
8734 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8735 return Builder.saveIP();
8736}
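// Hedged example: for a postfix capture such as `{v = x; x += expr;}` the
// value stored into v is the old x (AtomicResult->first); for the prefix form
// `{x += expr; v = x;}` it is the updated value (AtomicResult->second).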
8737
8738OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8739    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8740    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8741    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8742 bool IsFailOnly) {
8743
8744 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8745 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8746 IsPostfixUpdate, IsFailOnly, Failure);
8747}
8748
8749OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8750    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8751    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8752    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8753 bool IsFailOnly, AtomicOrdering Failure) {
8754
8755 if (!updateToLocation(Loc))
8756 return Loc.IP;
8757
8758 assert(X.Var->getType()->isPointerTy() &&
8759 "OMP atomic expects a pointer to target memory");
8760 // compare capture
8761 if (V.Var) {
8762 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8763 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8764 }
8765
8766 bool IsInteger = E->getType()->isIntegerTy();
8767
8768 if (Op == OMPAtomicCompareOp::EQ) {
8769 AtomicCmpXchgInst *Result = nullptr;
8770 if (!IsInteger) {
8771 IntegerType *IntCastTy =
8772 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8773 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8774 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8775 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8776 AO, Failure);
8777 } else {
8778 Result =
8779 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8780 }
8781
8782 if (V.Var) {
8783 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8784 if (!IsInteger)
8785 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8786 assert(OldValue->getType() == V.ElemTy &&
8787 "OldValue and V must be of same type");
8788 if (IsPostfixUpdate) {
8789 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8790 } else {
8791 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8792 if (IsFailOnly) {
8793 // CurBB----
8794 // | |
8795 // v |
8796 // ContBB |
8797 // | |
8798 // v |
8799 // ExitBB <-
8800 //
8801 // where ContBB only contains the store of old value to 'v'.
8802 BasicBlock *CurBB = Builder.GetInsertBlock();
8803 Instruction *CurBBTI = CurBB->getTerminator();
8804 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8805 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8806 CurBBTI, X.Var->getName() + ".atomic.exit");
8807 BasicBlock *ContBB = CurBB->splitBasicBlock(
8808 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8809 ContBB->getTerminator()->eraseFromParent();
8810 CurBB->getTerminator()->eraseFromParent();
8811
8812 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8813
8814 Builder.SetInsertPoint(ContBB);
8815 Builder.CreateStore(OldValue, V.Var);
8816 Builder.CreateBr(ExitBB);
8817
8818 if (UnreachableInst *ExitTI =
8819 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8820 CurBBTI->eraseFromParent();
8821 Builder.SetInsertPoint(ExitBB);
8822 } else {
8823 Builder.SetInsertPoint(ExitTI);
8824 }
8825 } else {
8826 Value *CapturedValue =
8827 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8828 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8829 }
8830 }
8831 }
8832 // The comparison result has to be stored.
8833 if (R.Var) {
8834 assert(R.Var->getType()->isPointerTy() &&
8835 "r.var must be of pointer type");
8836 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8837
8838 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8839 Value *ResultCast = R.IsSigned
8840 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8841 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8842 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8843 }
8844 } else {
8845 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8846 "Op should be either max or min at this point");
8847 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8848
8849 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8850 // Let's take max as example.
8851 // OpenMP form:
8852 // x = x > expr ? expr : x;
8853 // LLVM form:
8854 // *ptr = *ptr > val ? *ptr : val;
8855 // We need to transform to LLVM form.
8856 // x = x <= expr ? x : expr;
8857 AtomicRMWInst::BinOp NewOp;
8858 if (IsXBinopExpr) {
8859 if (IsInteger) {
8860 if (X.IsSigned)
8861 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8862 : AtomicRMWInst::Max;
8863 else
8864 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8865 : AtomicRMWInst::UMax;
8866 } else {
8867 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8868 : AtomicRMWInst::FMax;
8869 }
8870 } else {
8871 if (IsInteger) {
8872 if (X.IsSigned)
8873 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8874 : AtomicRMWInst::Min;
8875 else
8876 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8877 : AtomicRMWInst::UMin;
8878 } else {
8879 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8880 : AtomicRMWInst::FMin;
8881 }
8882 }
8883
8884 AtomicRMWInst *OldValue =
8885 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8886 if (V.Var) {
8887 Value *CapturedValue = nullptr;
8888 if (IsPostfixUpdate) {
8889 CapturedValue = OldValue;
8890 } else {
8891 CmpInst::Predicate Pred;
8892 switch (NewOp) {
8893 case AtomicRMWInst::Max:
8894 Pred = CmpInst::ICMP_SGT;
8895 break;
8896 case AtomicRMWInst::UMax:
8897 Pred = CmpInst::ICMP_UGT;
8898 break;
8899 case AtomicRMWInst::FMax:
8900 Pred = CmpInst::FCMP_OGT;
8901 break;
8902 case AtomicRMWInst::Min:
8903 Pred = CmpInst::ICMP_SLT;
8904 break;
8905 case AtomicRMWInst::UMin:
8906 Pred = CmpInst::ICMP_ULT;
8907 break;
8908 case AtomicRMWInst::FMin:
8909 Pred = CmpInst::FCMP_OLT;
8910 break;
8911 default:
8912 llvm_unreachable("unexpected comparison op");
8913 }
8914 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8915 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8916 }
8917 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8918 }
8919 }
8920
8921 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8922
8923 return Builder.saveIP();
8924}
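// Hedged example of the ordop reversal above: the OpenMP "max" form
//   #pragma omp atomic compare
//   x = x > e ? e : x;
// stores e only when it is smaller than x, which is exactly LLVM's
// `atomicrmw min` in the signed case, so NewOp ends up as AtomicRMWInst::Min.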
8925
8926OpenMPIRBuilder::InsertPointOrErrorTy
8927OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8928                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8929 Value *NumTeamsUpper, Value *ThreadLimit,
8930 Value *IfExpr) {
8931 if (!updateToLocation(Loc))
8932 return InsertPointTy();
8933
8934 uint32_t SrcLocStrSize;
8935 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8937 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8938
8939 // The outer allocation basic block is the entry block of the current function.
8940 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8941 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8942 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8943 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8944 }
8945
8946 // The current basic block is split into four basic blocks. After outlining,
8947 // they will be mapped as follows:
8948 // ```
8949 // def current_fn() {
8950 // current_basic_block:
8951 // br label %teams.exit
8952 // teams.exit:
8953 // ; instructions after teams
8954 // }
8955 //
8956 // def outlined_fn() {
8957 // teams.alloca:
8958 // br label %teams.body
8959 // teams.body:
8960 // ; instructions within teams body
8961 // }
8962 // ```
8963 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8964 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8965 BasicBlock *AllocaBB =
8966 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8967
8968 bool SubClausesPresent =
8969 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8970 // Push num_teams
8971 if (!Config.isTargetDevice() && SubClausesPresent) {
8972 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8973 "if lowerbound is non-null, then upperbound must also be non-null "
8974 "for bounds on num_teams");
8975
8976 if (NumTeamsUpper == nullptr)
8977 NumTeamsUpper = Builder.getInt32(0);
8978
8979 if (NumTeamsLower == nullptr)
8980 NumTeamsLower = NumTeamsUpper;
8981
8982 if (IfExpr) {
8983 assert(IfExpr->getType()->isIntegerTy() &&
8984 "argument to if clause must be an integer value");
8985
8986 // upper = ifexpr ? upper : 1
8987 if (IfExpr->getType() != Int1)
8988 IfExpr = Builder.CreateICmpNE(IfExpr,
8989 ConstantInt::get(IfExpr->getType(), 0));
8990 NumTeamsUpper = Builder.CreateSelect(
8991 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8992
8993 // lower = ifexpr ? lower : 1
8994 NumTeamsLower = Builder.CreateSelect(
8995 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8996 }
8997
8998 if (ThreadLimit == nullptr)
8999 ThreadLimit = Builder.getInt32(0);
9000
9001 Value *ThreadNum = getOrCreateThreadID(Ident);
9002 Builder.CreateCall(
9003 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9004 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9005 }
9006 // Generate the body of teams.
9007 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9008 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9009 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9010 return Err;
9011
9012 OutlineInfo OI;
9013 OI.EntryBB = AllocaBB;
9014 OI.ExitBB = ExitBB;
9015 OI.OuterAllocaBB = &OuterAllocaBB;
9016
9017 // Insert fake values for global tid and bound tid.
9018 SmallVector<Instruction *, 8> ToBeDeleted;
9019 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9020 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9021 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9022 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9023 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9024
9025 auto HostPostOutlineCB = [this, Ident,
9026 ToBeDeleted](Function &OutlinedFn) mutable {
9027 // The stale call instruction will be replaced with a new call instruction
9028 // for runtime call with the outlined function.
9029
9030 assert(OutlinedFn.getNumUses() == 1 &&
9031 "there must be a single user for the outlined function");
9032 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9033 ToBeDeleted.push_back(StaleCI);
9034
9035 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9036 "Outlined function must have two or three arguments only");
9037
9038 bool HasShared = OutlinedFn.arg_size() == 3;
9039
9040 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9041 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9042 if (HasShared)
9043 OutlinedFn.getArg(2)->setName("data");
9044
9045 // Call to the runtime function for teams in the current function.
9046 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9047 "outlined function.");
9048 Builder.SetInsertPoint(StaleCI);
9049 SmallVector<Value *> Args = {
9050 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9051 if (HasShared)
9052 Args.push_back(StaleCI->getArgOperand(2));
9053 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9054 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9055 Args);
9056
9057 for (Instruction *I : llvm::reverse(ToBeDeleted))
9058 I->eraseFromParent();
9059 };
9060
9061 if (!Config.isTargetDevice())
9062 OI.PostOutlineCB = HostPostOutlineCB;
9063
9064 addOutlineInfo(std::move(OI));
9065
9066 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9067
9068 return Builder.saveIP();
9069}
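// Hedged sketch of the host lowering above: after outlining, the teams region
// executes via
//   __kmpc_push_num_teams_51(ident, gtid, lower, upper, thread_limit);
//   __kmpc_fork_teams(ident, nargs, outlined_fn, shared_args...);
// where the push call is emitted only when one of the clauses is present.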
9070
9071GlobalVariable *
9072OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
9073                                       std::string VarName) {
9074 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9075 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
9076 Names.size()),
9077 Names);
9078 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9079 M, MapNamesArrayInit->getType(),
9080 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9081 VarName);
9082 return MapNamesArrayGlobal;
9083}
9084
9085// Create all simple and struct types exposed by the runtime and remember
9086// the llvm::PointerTypes of them for easy access later.
9087void OpenMPIRBuilder::initializeTypes(Module &M) {
9088 LLVMContext &Ctx = M.getContext();
9089 StructType *T;
9090#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9091#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9092 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9093 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
9094#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9095 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9096 VarName##Ptr = PointerType::getUnqual(VarName);
9097#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9098 T = StructType::getTypeByName(Ctx, StructName); \
9099 if (!T) \
9100 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9101 VarName = T; \
9102 VarName##Ptr = PointerType::getUnqual(T);
9103#include "llvm/Frontend/OpenMP/OMPKinds.def"
9104}
9105
9106void OpenMPIRBuilder::OutlineInfo::collectBlocks(
9107    SmallPtrSetImpl<BasicBlock *> &BlockSet,
9108    SmallVectorImpl<BasicBlock *> &BlockVector) {
9109 SmallVector<BasicBlock *, 32> Worklist;
9110 BlockSet.insert(EntryBB);
9111 BlockSet.insert(ExitBB);
9112
9113 Worklist.push_back(EntryBB);
9114 while (!Worklist.empty()) {
9115 BasicBlock *BB = Worklist.pop_back_val();
9116 BlockVector.push_back(BB);
9117 for (BasicBlock *SuccBB : successors(BB))
9118 if (BlockSet.insert(SuccBB).second)
9119 Worklist.push_back(SuccBB);
9120 }
9121}
9122
9123void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
9124                                         uint64_t Size, int32_t Flags,
9125                                         GlobalValue::LinkageTypes,
9126                                         StringRef Name) {
9127 if (!Config.isGPU()) {
9128 llvm::offloading::emitOffloadingEntry(
9129 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9130 "omp_offloading_entries");
9131 return;
9132 }
9133 // TODO: Add support for global variables on the device after declare target
9134 // support.
9135 Function *Fn = dyn_cast<Function>(Addr);
9136 if (!Fn)
9137 return;
9138
9139 Module &M = *(Fn->getParent());
9140 LLVMContext &Ctx = M.getContext();
9141
9142 // Get "nvvm.annotations" metadata node.
9143 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9144
9145 Metadata *MDVals[] = {
9146 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9147 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9148 // Append metadata to nvvm.annotations.
9149 MD->addOperand(MDNode::get(Ctx, MDVals));
9150
9151 // Add a function attribute for the kernel.
9152 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9153 if (T.isAMDGCN())
9154 Fn->addFnAttr("uniform-work-group-size", "true");
9155 Fn->addFnAttr(Attribute::MustProgress);
9156}
9157
9158// We only generate metadata for functions that contain target regions.
9159void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
9160    EmitMetadataErrorReportFunctionTy &ErrorFn) {
9161
9162 // If there are no entries, we don't need to do anything.
9163 if (OffloadInfoManager.empty())
9164 return;
9165
9166 LLVMContext &C = M.getContext();
9167 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
9168 TargetRegionEntryInfo>,
9169 16>
9170 OrderedEntries(OffloadInfoManager.size());
9171
9172 // Auxiliary methods to create metadata values and strings.
9173 auto &&GetMDInt = [this](unsigned V) {
9174 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9175 };
9176
9177 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9178
9179 // Create the offloading info metadata node.
9180 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9181 auto &&TargetRegionMetadataEmitter =
9182 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9183 const TargetRegionEntryInfo &EntryInfo,
9184 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
9185 // Generate metadata for target regions. Each entry of this metadata
9186 // contains:
9187 // - Entry 0 -> Kind of this type of metadata (0).
9188 // - Entry 1 -> Device ID of the file where the entry was identified.
9189 // - Entry 2 -> File ID of the file where the entry was identified.
9190 // - Entry 3 -> Mangled name of the function where the entry was
9191 // identified.
9192 // - Entry 4 -> Line in the file where the entry was identified.
9193 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9194 // - Entry 6 -> Order the entry was created.
9195 // The first element of the metadata node is the kind.
9196 Metadata *Ops[] = {
9197 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9198 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9199 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9200 GetMDInt(E.getOrder())};
9201
9202 // Save this entry in the right position of the ordered entries array.
9203 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9204
9205 // Add metadata to the named metadata node.
9206 MD->addOperand(MDNode::get(C, Ops));
9207 };
9208
9209 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9210
9211 // Create a function that emits metadata for each device global variable entry.
9212 auto &&DeviceGlobalVarMetadataEmitter =
9213 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9214 StringRef MangledName,
9215 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
9216 // Generate metadata for global variables. Each entry of this metadata
9217 // contains:
9218 // - Entry 0 -> Kind of this type of metadata (1).
9219 // - Entry 1 -> Mangled name of the variable.
9220 // - Entry 2 -> Declare target kind.
9221 // - Entry 3 -> Order the entry was created.
9222 // The first element of the metadata node is the kind.
9223 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9224 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9225
9226 // Save this entry in the right position of the ordered entries array.
9227 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9228 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9229
9230 // Add metadata to the named metadata node.
9231 MD->addOperand(MDNode::get(C, Ops));
9232 };
9233
9234 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
9235 DeviceGlobalVarMetadataEmitter);
9236
9237 for (const auto &E : OrderedEntries) {
9238 assert(E.first && "All ordered entries must exist!");
9239 if (const auto *CE =
9240 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9241 E.first)) {
9242 if (!CE->getID() || !CE->getAddress()) {
9243 // Do not blame the entry if the parent function is not emitted.
9244 TargetRegionEntryInfo EntryInfo = E.second;
9245 StringRef FnName = EntryInfo.ParentName;
9246 if (!M.getNamedValue(FnName))
9247 continue;
9248 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9249 continue;
9250 }
9251 createOffloadEntry(CE->getID(), CE->getAddress(),
9252 /*Size=*/0, CE->getFlags(),
9253 GlobalValue::WeakAnyLinkage);
9254 } else if (const auto *CE = dyn_cast<
9255 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
9256 E.first)) {
9257 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
9258 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9259 CE->getFlags());
9260 switch (Flags) {
9261 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
9262 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
9263 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
9264 continue;
9265 if (!CE->getAddress()) {
9266 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9267 continue;
9268 }
9269 // The variable has no definition - no need to add the entry.
9270 if (CE->getVarSize() == 0)
9271 continue;
9272 break;
9273 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
9274 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9275 (!Config.isTargetDevice() && CE->getAddress())) &&
9276 "Declaret target link address is set.");
9277 if (Config.isTargetDevice())
9278 continue;
9279 if (!CE->getAddress()) {
9280 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
9281 continue;
9282 }
9283 break;
9284 default:
9285 break;
9286 }
9287
9288 // Hidden or internal symbols on the device are not externally visible.
9289 // We should not attempt to register them by creating an offloading
9290 // entry. Indirect variables are handled separately on the device.
9291 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9292 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9293 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9294 continue;
9295
9296 // Indirect globals need to use a special name that doesn't match the name
9297 // of the associated host global.
9298 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9299 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9300 Flags, CE->getLinkage(), CE->getVarName());
9301 else
9302 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9303 Flags, CE->getLinkage());
9304
9305 } else {
9306 llvm_unreachable("Unsupported entry kind.");
9307 }
9308 }
9309
9310 // Emit requires directive globals to a special entry so the runtime can
9311 // register them when the device image is loaded.
9312 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9313 // entries should be redesigned to better suit this use-case.
9317 /*Name=*/"",
9319 Config.getRequiresFlags(), "omp_offloading_entries");
9320}
9321
9322 static void createTargetRegionEntryFnName(
9323 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9324 unsigned FileID, unsigned Line, unsigned Count) {
9325 raw_svector_ostream OS(Name);
9326 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9327 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9328 if (Count)
9329 OS << "_" << Count;
9330}
9331
9332 void OpenMPIRBuilder::getTargetRegionEntryFnName(
9333 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
9334 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9335 createTargetRegionEntryFnName(
9336 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9337 EntryInfo.Line, NewCount);
9338}
9339
9340 TargetRegionEntryInfo
9341 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
9342 StringRef ParentName) {
9343 sys::fs::UniqueID ID;
9344 auto FileIDInfo = CallBack();
9345 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9346 report_fatal_error(("Unable to get unique ID for file, during "
9347 "getTargetEntryUniqueInfo, error message: " +
9348 EC.message())
9349 .c_str());
9350 }
9351
9352 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9353 std::get<1>(FileIDInfo));
9354}
9355
9356 unsigned OpenMPIRBuilder::getFlagMemberOffset() {
9357 unsigned Offset = 0;
9358 for (uint64_t Remain =
9359 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9360 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
9361 !(Remain & 1); Remain = Remain >> 1)
9362 Offset++;
9363 return Offset;
9364}
9365
9366 omp::OpenMPOffloadMappingFlags
9367 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
9368 // Shift by getFlagMemberOffset() bits.
9369 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9370 << getFlagMemberOffset());
9371}
9372
9373 void OpenMPIRBuilder::setCorrectMemberOfFlag(
9374 omp::OpenMPOffloadMappingFlags &Flags,
9375 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9376 // If the entry is PTR_AND_OBJ but has not been marked with the special
9377 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9378 // marked as MEMBER_OF.
9379 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9381 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9384 return;
9385
9386 // Reset the placeholder value to prepare the flag for the assignment of the
9387 // proper MEMBER_OF value.
9388 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9389 Flags |= MemberOfFlag;
9390}
9391
9392 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
9393 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9394 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9395 bool IsDeclaration, bool IsExternallyVisible,
9396 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9397 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9398 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9399 std::function<Constant *()> GlobalInitializer,
9400 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9401 // TODO: convert this to utilise the IRBuilder Config rather than
9402 // a passed down argument.
9403 if (OpenMPSIMD)
9404 return nullptr;
9405
9406 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
9407 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9408 CaptureClause ==
9409 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9410 Config.hasRequiresUnifiedSharedMemory())) {
9411 SmallString<64> PtrName;
9412 {
9413 raw_svector_ostream OS(PtrName);
9414 OS << MangledName;
9415 if (!IsExternallyVisible)
9416 OS << format("_%x", EntryInfo.FileID);
9417 OS << "_decl_tgt_ref_ptr";
9418 }
9419
9420 Value *Ptr = M.getNamedValue(PtrName);
9421
9422 if (!Ptr) {
9423 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9424 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9425
9426 auto *GV = cast<GlobalVariable>(Ptr);
9427 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9428
9429 if (!Config.isTargetDevice()) {
9430 if (GlobalInitializer)
9431 GV->setInitializer(GlobalInitializer());
9432 else
9433 GV->setInitializer(GlobalValue);
9434 }
9435
9436 registerTargetGlobalVariable(
9437 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9438 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9439 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9440 }
9441
9442 return cast<Constant>(Ptr);
9443 }
9444
9445 return nullptr;
9446}
9447
9448 void OpenMPIRBuilder::registerTargetGlobalVariable(
9449 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9450 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9451 bool IsDeclaration, bool IsExternallyVisible,
9452 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9453 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9454 std::vector<Triple> TargetTriple,
9455 std::function<Constant *()> GlobalInitializer,
9456 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9457 Constant *Addr) {
9458 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
9459 (TargetTriple.empty() && !Config.isTargetDevice()))
9460 return;
9461
9462 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
9463 StringRef VarName;
9464 int64_t VarSize;
9465 GlobalValue::LinkageTypes Linkage;
9466
9467 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9468 CaptureClause ==
9469 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9470 !Config.hasRequiresUnifiedSharedMemory()) {
9471 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9472 VarName = MangledName;
9473 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9474
9475 if (!IsDeclaration)
9476 VarSize = divideCeil(
9477 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
9478 else
9479 VarSize = 0;
9480 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9481
9482 // This is a workaround carried over from Clang which prevents undesired
9483 // optimisation of internal variables.
9484 if (Config.isTargetDevice() &&
9485 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9486 // Do not create a "ref-variable" if the original is not also available
9487 // on the host.
9488 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
9489 return;
9490
9491 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9492
9493 if (!M.getNamedValue(RefName)) {
9494 Constant *AddrRef =
9495 getOrCreateInternalVariable(Addr->getType(), RefName);
9496 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9497 GvAddrRef->setConstant(true);
9498 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9499 GvAddrRef->setInitializer(Addr);
9500 GeneratedRefs.push_back(GvAddrRef);
9501 }
9502 }
9503 } else {
9504 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
9505 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
9506 else
9507 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9508
9509 if (Config.isTargetDevice()) {
9510 VarName = (Addr) ? Addr->getName() : "";
9511 Addr = nullptr;
9512 } else {
9513 Addr = getAddrOfDeclareTargetVar(
9514 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9515 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9516 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9517 VarName = (Addr) ? Addr->getName() : "";
9518 }
9519 VarSize = M.getDataLayout().getPointerSize();
9520 Linkage = GlobalValue::WeakAnyLinkage;
9521 }
9522
9523 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
9524 Flags, Linkage);
9525}
9526
9527/// Loads all the offload entries information from the host IR
9528/// metadata.
9529 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
9530 // If we are in target mode, load the metadata from the host IR. This code has
9531 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9532
9534 if (!MD)
9535 return;
9536
9537 for (MDNode *MN : MD->operands()) {
9538 auto &&GetMDInt = [MN](unsigned Idx) {
9539 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9540 return cast<ConstantInt>(V->getValue())->getZExtValue();
9541 };
9542
9543 auto &&GetMDString = [MN](unsigned Idx) {
9544 auto *V = cast<MDString>(MN->getOperand(Idx));
9545 return V->getString();
9546 };
9547
9548 switch (GetMDInt(0)) {
9549 default:
9550 llvm_unreachable("Unexpected metadata!");
9551 break;
9552 case OffloadEntriesInfoManager::OffloadEntryInfo::
9553 OffloadingEntryInfoTargetRegion: {
9554 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9555 /*DeviceID=*/GetMDInt(1),
9556 /*FileID=*/GetMDInt(2),
9557 /*Line=*/GetMDInt(4),
9558 /*Count=*/GetMDInt(5));
9559 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
9560 /*Order=*/GetMDInt(6));
9561 break;
9562 }
9563 case OffloadEntriesInfoManager::OffloadEntryInfo::
9564 OffloadingEntryInfoDeviceGlobalVar:
9565 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
9566 /*MangledName=*/GetMDString(1),
9567 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9568 /*Flags=*/GetMDInt(2)),
9569 /*Order=*/GetMDInt(3));
9570 break;
9571 }
9572 }
9573}
9574
9575 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
9576 if (HostFilePath.empty())
9577 return;
9578
9579 auto Buf = MemoryBuffer::getFile(HostFilePath);
9580 if (std::error_code Err = Buf.getError()) {
9581 report_fatal_error(("error opening host file from host file path inside of "
9582 "OpenMPIRBuilder: " +
9583 Err.message())
9584 .c_str());
9585 }
9586
9587 LLVMContext Ctx;
9588 auto M = expectedToErrorOrAndEmitErrors(
9589 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9590 if (std::error_code Err = M.getError()) {
9592 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9593 .c_str());
9594 }
9595
9596 loadOffloadInfoMetadata(*M.get());
9597}
9598
9599//===----------------------------------------------------------------------===//
9600// OffloadEntriesInfoManager
9601//===----------------------------------------------------------------------===//
9602
9603 bool OffloadEntriesInfoManager::empty() const {
9604 return OffloadEntriesTargetRegion.empty() &&
9605 OffloadEntriesDeviceGlobalVar.empty();
9606}
9607
9608unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9609 const TargetRegionEntryInfo &EntryInfo) const {
9610 auto It = OffloadEntriesTargetRegionCount.find(
9611 getTargetRegionEntryCountKey(EntryInfo));
9612 if (It == OffloadEntriesTargetRegionCount.end())
9613 return 0;
9614 return It->second;
9615}
9616
9617void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9618 const TargetRegionEntryInfo &EntryInfo) {
9619 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9620 EntryInfo.Count + 1;
9621}
9622
9623/// Initialize target region entry.
9624 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
9625 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9626 OffloadEntriesTargetRegion[EntryInfo] =
9627 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9628 OMPTargetRegionEntryTargetRegion);
9629 ++OffloadingEntriesNum;
9630}
9631
9632 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
9633 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
9634 OMPTargetRegionEntryKind Flags) {
9635 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9636
9637 // Update the EntryInfo with the next available count for this location.
9638 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9639
9640 // If we are emitting code for a target, the entry is already initialized
9641 // and only has to be registered.
9642 if (OMPBuilder->Config.isTargetDevice()) {
9643 // This could happen if the device compilation is invoked standalone.
9644 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9645 return;
9646 }
9647 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9648 Entry.setAddress(Addr);
9649 Entry.setID(ID);
9650 Entry.setFlags(Flags);
9651 } else {
9652 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
9653 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9654 return;
9655 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9656 "Target region entry already registered!");
9657 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9658 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9659 ++OffloadingEntriesNum;
9660 }
9661 incrementTargetRegionEntryInfoCount(EntryInfo);
9662}
9663
9664 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
9665 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9666
9667 // Update the EntryInfo with the next available count for this location.
9668 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9669
9670 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9671 if (It == OffloadEntriesTargetRegion.end()) {
9672 return false;
9673 }
9674 // Fail if this entry is already registered.
9675 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9676 return false;
9677 return true;
9678}
9679
9680 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
9681 const OffloadTargetRegionEntryInfoActTy &Action) {
9682 // Scan all target region entries and perform the provided action.
9683 for (const auto &It : OffloadEntriesTargetRegion) {
9684 Action(It.first, It.second);
9685 }
9686}
9687
9688 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
9689 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9690 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9691 ++OffloadingEntriesNum;
9692}
9693
9694 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
9695 StringRef VarName, Constant *Addr, int64_t VarSize,
9696 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
9697 if (OMPBuilder->Config.isTargetDevice()) {
9698 // This could happen if the device compilation is invoked standalone.
9699 if (!hasDeviceGlobalVarEntryInfo(VarName))
9700 return;
9701 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9702 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9703 if (Entry.getVarSize() == 0) {
9704 Entry.setVarSize(VarSize);
9705 Entry.setLinkage(Linkage);
9706 }
9707 return;
9708 }
9709 Entry.setVarSize(VarSize);
9710 Entry.setLinkage(Linkage);
9711 Entry.setAddress(Addr);
9712 } else {
9713 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9714 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9715 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9716 "Entry not initialized!");
9717 if (Entry.getVarSize() == 0) {
9718 Entry.setVarSize(VarSize);
9719 Entry.setLinkage(Linkage);
9720 }
9721 return;
9722 }
9723 if (Flags == OMPTargetGlobalVarEntryIndirect)
9724 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9725 Addr, VarSize, Flags, Linkage,
9726 VarName.str());
9727 else
9728 OffloadEntriesDeviceGlobalVar.try_emplace(
9729 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9730 ++OffloadingEntriesNum;
9731 }
9732}
9733
9734 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9735 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9736 // Scan all device global variable entries and perform the provided action.
9737 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9738 Action(E.getKey(), E.getValue());
9739}
9740
9741//===----------------------------------------------------------------------===//
9742// CanonicalLoopInfo
9743//===----------------------------------------------------------------------===//
9744
9745void CanonicalLoopInfo::collectControlBlocks(
9746 SmallVectorImpl<BasicBlock *> &BBs) {
9747 // We only count those BBs as control blocks for which we do not need to
9748 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9749 // flow. For consistency, this also means we do not add the Body block, which
9750 // is just the entry to the body code.
9751 BBs.reserve(BBs.size() + 6);
9752 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9753}
9754
9755 BasicBlock *CanonicalLoopInfo::getPreheader() const {
9756 assert(isValid() && "Requires a valid canonical loop");
9757 for (BasicBlock *Pred : predecessors(Header)) {
9758 if (Pred != Latch)
9759 return Pred;
9760 }
9761 llvm_unreachable("Missing preheader");
9762}
9763
9764void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9765 assert(isValid() && "Requires a valid canonical loop");
9766
9767 Instruction *CmpI = &getCond()->front();
9768 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9769 CmpI->setOperand(1, TripCount);
9770
9771#ifndef NDEBUG
9772 assertOK();
9773#endif
9774}
9775
9776void CanonicalLoopInfo::mapIndVar(
9777 llvm::function_ref<Value *(Instruction *)> Updater) {
9778 assert(isValid() && "Requires a valid canonical loop");
9779
9780 Instruction *OldIV = getIndVar();
9781
9782 // Record all uses excluding those introduced by the updater. Uses by the
9783 // CanonicalLoopInfo itself to keep track of the number of iterations are
9784 // excluded.
9785 SmallVector<Use *> ReplacableUses;
9786 for (Use &U : OldIV->uses()) {
9787 auto *User = dyn_cast<Instruction>(U.getUser());
9788 if (!User)
9789 continue;
9790 if (User->getParent() == getCond())
9791 continue;
9792 if (User->getParent() == getLatch())
9793 continue;
9794 ReplacableUses.push_back(&U);
9795 }
9796
9797 // Run the updater that may introduce new uses
9798 Value *NewIV = Updater(OldIV);
9799
9800 // Replace the old uses with the value returned by the updater.
9801 for (Use *U : ReplacableUses)
9802 U->set(NewIV);
9803
9804#ifndef NDEBUG
9805 assertOK();
9806#endif
9807}
9808
9809 void CanonicalLoopInfo::assertOK() const {
9810 #ifndef NDEBUG
9811 // No constraints if this object currently does not describe a loop.
9812 if (!isValid())
9813 return;
9814
9815 BasicBlock *Preheader = getPreheader();
9816 BasicBlock *Body = getBody();
9817 BasicBlock *After = getAfter();
9818
9819 // Verify standard control-flow we use for OpenMP loops.
9820 assert(Preheader);
9821 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9822 "Preheader must terminate with unconditional branch");
9823 assert(Preheader->getSingleSuccessor() == Header &&
9824 "Preheader must jump to header");
9825
9826 assert(Header);
9827 assert(isa<BranchInst>(Header->getTerminator()) &&
9828 "Header must terminate with unconditional branch");
9829 assert(Header->getSingleSuccessor() == Cond &&
9830 "Header must jump to exiting block");
9831
9832 assert(Cond);
9833 assert(Cond->getSinglePredecessor() == Header &&
9834 "Exiting block only reachable from header");
9835
9836 assert(isa<BranchInst>(Cond->getTerminator()) &&
9837 "Exiting block must terminate with conditional branch");
9838 assert(size(successors(Cond)) == 2 &&
9839 "Exiting block must have two successors");
9840 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9841 "Exiting block's first successor jump to the body");
9842 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9843 "Exiting block's second successor must exit the loop");
9844
9845 assert(Body);
9846 assert(Body->getSinglePredecessor() == Cond &&
9847 "Body only reachable from exiting block");
9848 assert(!isa<PHINode>(Body->front()));
9849
9850 assert(Latch);
9851 assert(isa<BranchInst>(Latch->getTerminator()) &&
9852 "Latch must terminate with unconditional branch");
9853 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9854 // TODO: To support simple redirecting of the end of body code that has
9855 // multiple exit points; introduce another auxiliary basic block like preheader and after.
9856 assert(Latch->getSinglePredecessor() != nullptr);
9857 assert(!isa<PHINode>(Latch->front()));
9858
9859 assert(Exit);
9860 assert(isa<BranchInst>(Exit->getTerminator()) &&
9861 "Exit block must terminate with unconditional branch");
9862 assert(Exit->getSingleSuccessor() == After &&
9863 "Exit block must jump to after block");
9864
9865 assert(After);
9866 assert(After->getSinglePredecessor() == Exit &&
9867 "After block only reachable from exit block");
9868 assert(After->empty() || !isa<PHINode>(After->front()));
9869
9870 Instruction *IndVar = getIndVar();
9871 assert(IndVar && "Canonical induction variable not found?");
9872 assert(isa<IntegerType>(IndVar->getType()) &&
9873 "Induction variable must be an integer");
9874 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9875 "Induction variable must be a PHI in the loop header");
9876 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9877 assert(
9878 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9879 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9880
9881 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9882 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9883 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9884 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9885 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9886 ->isOne());
9887
9888 Value *TripCount = getTripCount();
9889 assert(TripCount && "Loop trip count not found?");
9890 assert(IndVar->getType() == TripCount->getType() &&
9891 "Trip count and induction variable must have the same type");
9892
9893 auto *CmpI = cast<CmpInst>(&Cond->front());
9894 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9895 "Exit condition must be a signed less-than comparison");
9896 assert(CmpI->getOperand(0) == IndVar &&
9897 "Exit condition must compare the induction variable");
9898 assert(CmpI->getOperand(1) == TripCount &&
9899 "Exit condition must compare with the trip count");
9900#endif
9901}
9902
9903 void CanonicalLoopInfo::invalidate() {
9904 Header = nullptr;
9905 Cond = nullptr;
9906 Latch = nullptr;
9907 Exit = nullptr;
9908}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
unsigned unsigned DefaultVal
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:599
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:933
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:918
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1921
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1261
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1267
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2990
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Debug location.
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:809
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:853
arg_iterator arg_begin()
Definition: Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:356
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:754
size_t arg_size() const
Definition: Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:276
BasicBlock * getBlock() const
Definition: IRBuilder.h:291
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:289
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:292
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1416
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1065
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1848
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1886
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1780
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2561
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2050
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1305
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2198
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2554
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1255
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1979
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:600
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2044
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2146
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1378
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2210
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1420
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1382
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:540
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1873
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1732
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:296
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2403
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1186
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2269
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1386
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1163
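A short sketch of emitting a two-way branch, assuming a Builder, a Value *Cond of type i1, a Function *F, and its LLVMContext &Ctx (all hypothetical):
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", F);
  Builder.CreateCondBr(Cond, ThenBB, ElseBB); // terminates the current block
  Builder.SetInsertPoint(ThenBB);             // continue on the 'then' side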
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1797
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1458
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2032
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1517
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1133
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1920
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1966
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1810
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1369
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2141
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1429
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2587
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2448
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1861
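For illustration only, a monotonic atomic increment of an i32 behind a hypothetical pointer Ptr:
  Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Builder.getInt32(1),
                          MaybeAlign(4), AtomicOrdering::Monotonic);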
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2018
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1539
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1157
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2301
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2281
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2224
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2582
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1498
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1561
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2379
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1446
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
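A one-line sketch, assuming hypothetical pointers Dst and Src that are both 8-byte aligned:
  Builder.CreateMemCpy(Dst, MaybeAlign(8), Src, MaybeAlign(8), /*Size=*/64);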
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2065
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2156
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
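A minimal sketch; the returned global is a private constant i8 array whose address can be used wherever a pointer to the string is needed:
  GlobalVariable *Str = Builder.CreateGlobalString("hello", ".str");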
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
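A sketch of upgrading a plain load to an atomic acquire load, assuming a hypothetical pointer Ptr to an i32:
  LoadInst *LI = Builder.CreateLoad(Builder.getInt32Ty(), Ptr, "val");
  LI->setAtomic(AtomicOrdering::Acquire); // default SyncScope::System
  LI->setAlignment(Align(4));             // ensure natural alignment for the atomic access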
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1553
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1545
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
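A small sketch of assembling a metadata tuple from a string, assuming an LLVMContext &Ctx:
  MDString *Key = MDString::get(Ctx, "llvm.loop.unroll.enable");
  MDNode *Node = MDTuple::get(Ctx, {Key}); // uniqued tuple with one operand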
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1733
iterator_range< op_iterator > operands()
Definition: Metadata.h:1829
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link entry.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target to entry.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
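A hedged sketch of building the ident_t* for the default source location, assuming a hypothetical OpenMPIRBuilder instance OMPBuilder:
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);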
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
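For example, fetching the declaration of __kmpc_barrier (a sketch; OMPBuilder and the module M are assumed to exist):
  FunctionCallee Barrier = OMPBuilder.getOrCreateRuntimeFunction(
      M, omp::RuntimeFunction::OMPRTL___kmpc_barrier);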
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr)
Generator for #omp task
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
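A hedged usage sketch, assuming Builder is positioned where the barrier belongs and OMPBuilder is initialized:
  OpenMPIRBuilder::LocationDescription Loc(Builder);
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP)
    report_fatal_error(AfterIP.takeError());
  Builder.restoreIP(*AfterIP); // continue after the emitted barrier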
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
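A sketch of the callback-driven interface, assuming Loc, OMPBuilder, and a Value *TripCount are already set up:
  Expected<CanonicalLoopInfo *> LoopOrErr = OMPBuilder.createCanonicalLoop(
      Loc,
      [&](OpenMPIRBuilder::InsertPointTy CodeGenIP, Value *IV) -> Error {
        // Emit the loop body here; IV is the canonical induction variable.
        return Error::success();
      },
      TripCount);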
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
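A trivial illustration of the small-size optimization: the first four elements below live inline, and a heap allocation happens only once the vector outgrows them:
  SmallVector<int, 4> V;
  V.push_back(1);
  V.append({2, 3, 4}); // still within the inline capacity
  V.push_back(5);      // grows beyond it, allocating on the heap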
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
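A small illustration:
  StringRef Line("key=value");
  auto [Key, Val] = Line.split('='); // Key == "key", Val == "value"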
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:990
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1048
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1058
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
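A sketch, assuming Builder points into the block to be split:
  BasicBlock *Tail = splitBB(Builder.saveIP(), /*CreateBranch=*/true, "tail");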
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
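For instance:
  unsigned Chunks = divideCeil(37, 8); // 5, since ceil(37 / 8) == 5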
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
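A sketch of gathering a loop's ephemeral values before costing it; L and AC are assumed to come from the surrounding analysis:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/CodeMetrics.h"
    #include "llvm/Analysis/LoopInfo.h"

    void gatherEphemerals(const llvm::Loop *L, llvm::AssumptionCache *AC) {
      llvm::SmallPtrSet<const llvm::Value *, 32> EphValues;
      llvm::CodeMetrics::collectEphemeralValues(L, AC, EphValues);
      // EphValues now holds values that only feed assumptions and can be
      // skipped when estimating the loop's size.
    }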
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic operations.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, user-defined mappers, and non-contiguous information.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region (if no separate map types are needed for the end of the region).
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes and populate associated static structures.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with which the kernel must be launched, as well as the trip count of the loop, if it is an SPMD or Generic-SPMD kernel.
Value * MaxThreads
Value of the 'num_threads' clause of the 'parallel' construct, if present and the kernel is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel, or null if it is a Generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
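A sketch with made-up IDs; the exact mangled form is produced by the callee:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

    void makeEntryName() {
      llvm::SmallString<64> Name;
      llvm::TargetRegionEntryInfo::getTargetRegionEntryFnName(
          Name, /*ParentName=*/"foo", /*DeviceID=*/1, /*FileID=*/2,
          /*Line=*/3, /*Count=*/0);
      // Name now uniquely identifies this target region entry.
    }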
static const Target * lookupTarget(StringRef Triple, std::string &Error)
Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
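A sketch of how these knobs fit together; in-tree code normally starts from the struct returned by gatherUnrollingPreferences, and the values below are illustrative only:

    #include "llvm/Analysis/TargetTransformInfo.h"

    void tuneUnrolling(llvm::TargetTransformInfo::UnrollingPreferences &UP) {
      UP.Count = 4;       // force a 4x unroll
      UP.Threshold = 300; // allow a larger unrolled body
      UP.Force = true;    // unroll even when heuristics would decline
    }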
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61