LLVM 20.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
29#include "llvm/IR/Attributes.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/CFG.h"
32#include "llvm/IR/CallingConv.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DIBuilder.h"
38#include "llvm/IR/Function.h"
40#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/LLVMContext.h"
42#include "llvm/IR/MDBuilder.h"
43#include "llvm/IR/Metadata.h"
45#include "llvm/IR/PassManager.h"
47#include "llvm/IR/Value.h"
59
60#include <cstdint>
61#include <optional>
62
63#define DEBUG_TYPE "openmp-ir-builder"
64
65using namespace llvm;
66using namespace omp;
67
68static cl::opt<bool>
69 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
70 cl::desc("Use optimistic attributes describing "
71 "'as-if' properties of runtime calls."),
72 cl::init(false));
73
75 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
76 cl::desc("Factor for the unroll threshold to account for code "
77 "simplifications still taking place"),
78 cl::init(1.5));
79
80#ifndef NDEBUG
81/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
82/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
83/// an InsertPoint stores the instruction before something is inserted. For
84/// instance, if both point to the same instruction, two IRBuilders alternating
85/// creating instruction will cause the instructions to be interleaved.
88 if (!IP1.isSet() || !IP2.isSet())
89 return false;
90 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
91}
92
94 // Valid ordered/unordered and base algorithm combinations.
95 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
96 case OMPScheduleType::UnorderedStaticChunked:
97 case OMPScheduleType::UnorderedStatic:
98 case OMPScheduleType::UnorderedDynamicChunked:
99 case OMPScheduleType::UnorderedGuidedChunked:
100 case OMPScheduleType::UnorderedRuntime:
101 case OMPScheduleType::UnorderedAuto:
102 case OMPScheduleType::UnorderedTrapezoidal:
103 case OMPScheduleType::UnorderedGreedy:
104 case OMPScheduleType::UnorderedBalanced:
105 case OMPScheduleType::UnorderedGuidedIterativeChunked:
106 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
107 case OMPScheduleType::UnorderedSteal:
108 case OMPScheduleType::UnorderedStaticBalancedChunked:
109 case OMPScheduleType::UnorderedGuidedSimd:
110 case OMPScheduleType::UnorderedRuntimeSimd:
111 case OMPScheduleType::OrderedStaticChunked:
112 case OMPScheduleType::OrderedStatic:
113 case OMPScheduleType::OrderedDynamicChunked:
114 case OMPScheduleType::OrderedGuidedChunked:
115 case OMPScheduleType::OrderedRuntime:
116 case OMPScheduleType::OrderedAuto:
117 case OMPScheduleType::OrderdTrapezoidal:
118 case OMPScheduleType::NomergeUnorderedStaticChunked:
119 case OMPScheduleType::NomergeUnorderedStatic:
120 case OMPScheduleType::NomergeUnorderedDynamicChunked:
121 case OMPScheduleType::NomergeUnorderedGuidedChunked:
122 case OMPScheduleType::NomergeUnorderedRuntime:
123 case OMPScheduleType::NomergeUnorderedAuto:
124 case OMPScheduleType::NomergeUnorderedTrapezoidal:
125 case OMPScheduleType::NomergeUnorderedGreedy:
126 case OMPScheduleType::NomergeUnorderedBalanced:
127 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
128 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
129 case OMPScheduleType::NomergeUnorderedSteal:
130 case OMPScheduleType::NomergeOrderedStaticChunked:
131 case OMPScheduleType::NomergeOrderedStatic:
132 case OMPScheduleType::NomergeOrderedDynamicChunked:
133 case OMPScheduleType::NomergeOrderedGuidedChunked:
134 case OMPScheduleType::NomergeOrderedRuntime:
135 case OMPScheduleType::NomergeOrderedAuto:
136 case OMPScheduleType::NomergeOrderedTrapezoidal:
137 break;
138 default:
139 return false;
140 }
141
142 // Must not set both monotonicity modifiers at the same time.
143 OMPScheduleType MonotonicityFlags =
144 SchedType & OMPScheduleType::MonotonicityMask;
145 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
146 return false;
147
148 return true;
149}
150#endif
151
152static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
153 if (T.isAMDGPU()) {
154 StringRef Features =
155 Kernel->getFnAttribute("target-features").getValueAsString();
156 if (Features.count("+wavefrontsize64"))
157 return omp::getAMDGPUGridValues<64>();
158 return omp::getAMDGPUGridValues<32>();
159 }
160 if (T.isNVPTX())
162 llvm_unreachable("No grid value available for this architecture!");
163}
164
165/// Determine which scheduling algorithm to use, determined from schedule clause
166/// arguments.
167static OMPScheduleType
168getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
169 bool HasSimdModifier) {
170 // Currently, the default schedule it static.
171 switch (ClauseKind) {
172 case OMP_SCHEDULE_Default:
173 case OMP_SCHEDULE_Static:
174 return HasChunks ? OMPScheduleType::BaseStaticChunked
175 : OMPScheduleType::BaseStatic;
176 case OMP_SCHEDULE_Dynamic:
177 return OMPScheduleType::BaseDynamicChunked;
178 case OMP_SCHEDULE_Guided:
179 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
180 : OMPScheduleType::BaseGuidedChunked;
181 case OMP_SCHEDULE_Auto:
183 case OMP_SCHEDULE_Runtime:
184 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
185 : OMPScheduleType::BaseRuntime;
186 }
187 llvm_unreachable("unhandled schedule clause argument");
188}
189
190/// Adds ordering modifier flags to schedule type.
191static OMPScheduleType
193 bool HasOrderedClause) {
194 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
195 OMPScheduleType::None &&
196 "Must not have ordering nor monotonicity flags already set");
197
198 OMPScheduleType OrderingModifier = HasOrderedClause
199 ? OMPScheduleType::ModifierOrdered
200 : OMPScheduleType::ModifierUnordered;
201 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
202
203 // Unsupported combinations
204 if (OrderingScheduleType ==
205 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
206 return OMPScheduleType::OrderedGuidedChunked;
207 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
208 OMPScheduleType::ModifierOrdered))
209 return OMPScheduleType::OrderedRuntime;
210
211 return OrderingScheduleType;
212}
213
214/// Adds monotonicity modifier flags to schedule type.
215static OMPScheduleType
217 bool HasSimdModifier, bool HasMonotonic,
218 bool HasNonmonotonic, bool HasOrderedClause) {
219 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
220 OMPScheduleType::None &&
221 "Must not have monotonicity flags already set");
222 assert((!HasMonotonic || !HasNonmonotonic) &&
223 "Monotonic and Nonmonotonic are contradicting each other");
224
225 if (HasMonotonic) {
226 return ScheduleType | OMPScheduleType::ModifierMonotonic;
227 } else if (HasNonmonotonic) {
228 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
229 } else {
230 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
231 // If the static schedule kind is specified or if the ordered clause is
232 // specified, and if the nonmonotonic modifier is not specified, the
233 // effect is as if the monotonic modifier is specified. Otherwise, unless
234 // the monotonic modifier is specified, the effect is as if the
235 // nonmonotonic modifier is specified.
236 OMPScheduleType BaseScheduleType =
237 ScheduleType & ~OMPScheduleType::ModifierMask;
238 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
239 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
240 HasOrderedClause) {
241 // The monotonic is used by default in openmp runtime library, so no need
242 // to set it.
243 return ScheduleType;
244 } else {
245 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
246 }
247 }
248}
249
250/// Determine the schedule type using schedule and ordering clause arguments.
251static OMPScheduleType
252computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
253 bool HasSimdModifier, bool HasMonotonicModifier,
254 bool HasNonmonotonicModifier, bool HasOrderedClause) {
255 OMPScheduleType BaseSchedule =
256 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
257 OMPScheduleType OrderedSchedule =
258 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
260 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
261 HasNonmonotonicModifier, HasOrderedClause);
262
264 return Result;
265}
266
267/// Make \p Source branch to \p Target.
268///
269/// Handles two situations:
270/// * \p Source already has an unconditional branch.
271/// * \p Source is a degenerate block (no terminator because the BB is
272/// the current head of the IR construction).
274 if (Instruction *Term = Source->getTerminator()) {
275 auto *Br = cast<BranchInst>(Term);
276 assert(!Br->isConditional() &&
277 "BB's terminator must be an unconditional branch (or degenerate)");
278 BasicBlock *Succ = Br->getSuccessor(0);
279 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
280 Br->setSuccessor(0, Target);
281 return;
282 }
283
284 auto *NewBr = BranchInst::Create(Target, Source);
285 NewBr->setDebugLoc(DL);
286}
287
289 bool CreateBranch) {
290 assert(New->getFirstInsertionPt() == New->begin() &&
291 "Target BB must not have PHI nodes");
292
293 // Move instructions to new block.
294 BasicBlock *Old = IP.getBlock();
295 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
296
297 if (CreateBranch)
298 BranchInst::Create(New, Old);
299}
300
301void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
303 BasicBlock *Old = Builder.GetInsertBlock();
304
305 spliceBB(Builder.saveIP(), New, CreateBranch);
306 if (CreateBranch)
307 Builder.SetInsertPoint(Old->getTerminator());
308 else
309 Builder.SetInsertPoint(Old);
310
311 // SetInsertPoint also updates the Builder's debug location, but we want to
312 // keep the one the Builder was configured to use.
314}
315
318 BasicBlock *Old = IP.getBlock();
320 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
321 Old->getParent(), Old->getNextNode());
322 spliceBB(IP, New, CreateBranch);
323 New->replaceSuccessorsPhiUsesWith(Old, New);
324 return New;
325}
326
327BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
330 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
331 if (CreateBranch)
332 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
333 else
334 Builder.SetInsertPoint(Builder.GetInsertBlock());
335 // SetInsertPoint also updates the Builder's debug location, but we want to
336 // keep the one the Builder was configured to use.
338 return New;
339}
340
341BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
344 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
345 if (CreateBranch)
346 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
347 else
348 Builder.SetInsertPoint(Builder.GetInsertBlock());
349 // SetInsertPoint also updates the Builder's debug location, but we want to
350 // keep the one the Builder was configured to use.
352 return New;
353}
354
356 llvm::Twine Suffix) {
357 BasicBlock *Old = Builder.GetInsertBlock();
358 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
359}
360
361// This function creates a fake integer value and a fake use for the integer
362// value. It returns the fake value created. This is useful in modeling the
363// extra arguments to the outlined functions.
365 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
367 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
368 const Twine &Name = "", bool AsPtr = true) {
369 Builder.restoreIP(OuterAllocaIP);
370 Instruction *FakeVal;
371 AllocaInst *FakeValAddr =
372 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
373 ToBeDeleted.push_back(FakeValAddr);
374
375 if (AsPtr) {
376 FakeVal = FakeValAddr;
377 } else {
378 FakeVal =
379 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
380 ToBeDeleted.push_back(FakeVal);
381 }
382
383 // Generate a fake use of this value
384 Builder.restoreIP(InnerAllocaIP);
385 Instruction *UseFakeVal;
386 if (AsPtr) {
387 UseFakeVal =
388 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
389 } else {
390 UseFakeVal =
391 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
392 }
393 ToBeDeleted.push_back(UseFakeVal);
394 return FakeVal;
395}
396
397//===----------------------------------------------------------------------===//
398// OpenMPIRBuilderConfig
399//===----------------------------------------------------------------------===//
400
401namespace {
403/// Values for bit flags for marking which requires clauses have been used.
404enum OpenMPOffloadingRequiresDirFlags {
405 /// flag undefined.
406 OMP_REQ_UNDEFINED = 0x000,
407 /// no requires directive present.
408 OMP_REQ_NONE = 0x001,
409 /// reverse_offload clause.
410 OMP_REQ_REVERSE_OFFLOAD = 0x002,
411 /// unified_address clause.
412 OMP_REQ_UNIFIED_ADDRESS = 0x004,
413 /// unified_shared_memory clause.
414 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
415 /// dynamic_allocators clause.
416 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
417 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
418};
419
420} // anonymous namespace
421
423 : RequiresFlags(OMP_REQ_UNDEFINED) {}
424
426 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
427 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
428 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
429 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
430 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
431 RequiresFlags(OMP_REQ_UNDEFINED) {
432 if (HasRequiresReverseOffload)
433 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
434 if (HasRequiresUnifiedAddress)
435 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
436 if (HasRequiresUnifiedSharedMemory)
437 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
438 if (HasRequiresDynamicAllocators)
439 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
440}
441
443 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
444}
445
447 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
448}
449
451 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
452}
453
455 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
456}
457
459 return hasRequiresFlags() ? RequiresFlags
460 : static_cast<int64_t>(OMP_REQ_NONE);
461}
462
464 if (Value)
465 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
466 else
467 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
468}
469
471 if (Value)
472 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
473 else
474 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
475}
476
478 if (Value)
479 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
480 else
481 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
482}
483
485 if (Value)
486 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
487 else
488 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
489}
490
491//===----------------------------------------------------------------------===//
492// OpenMPIRBuilder
493//===----------------------------------------------------------------------===//
494
496 IRBuilderBase &Builder,
497 SmallVector<Value *> &ArgsVector) {
499 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
500 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
501 constexpr const size_t MaxDim = 3;
502 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
503 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
504
505 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
506
507 Value *NumTeams3D =
508 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
509 Value *NumThreads3D =
510 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
511 for (unsigned I :
512 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
513 NumTeams3D =
514 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
515 for (unsigned I :
516 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
517 NumThreads3D =
518 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
519
520 ArgsVector = {Version,
521 PointerNum,
522 KernelArgs.RTArgs.BasePointersArray,
523 KernelArgs.RTArgs.PointersArray,
524 KernelArgs.RTArgs.SizesArray,
525 KernelArgs.RTArgs.MapTypesArray,
526 KernelArgs.RTArgs.MapNamesArray,
527 KernelArgs.RTArgs.MappersArray,
528 KernelArgs.NumIterations,
529 Flags,
530 NumTeams3D,
531 NumThreads3D,
532 KernelArgs.DynCGGroupMem};
533}
534
536 LLVMContext &Ctx = Fn.getContext();
537
538 // Get the function's current attributes.
539 auto Attrs = Fn.getAttributes();
540 auto FnAttrs = Attrs.getFnAttrs();
541 auto RetAttrs = Attrs.getRetAttrs();
543 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
544 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
545
546 // Add AS to FnAS while taking special care with integer extensions.
547 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
548 bool Param = true) -> void {
549 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
550 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
551 if (HasSignExt || HasZeroExt) {
552 assert(AS.getNumAttributes() == 1 &&
553 "Currently not handling extension attr combined with others.");
554 if (Param) {
555 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
556 FnAS = FnAS.addAttribute(Ctx, AK);
557 } else if (auto AK =
558 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
559 FnAS = FnAS.addAttribute(Ctx, AK);
560 } else {
561 FnAS = FnAS.addAttributes(Ctx, AS);
562 }
563 };
564
565#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
566#include "llvm/Frontend/OpenMP/OMPKinds.def"
567
568 // Add attributes to the function declaration.
569 switch (FnID) {
570#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
571 case Enum: \
572 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
573 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
574 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
575 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
576 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
577 break;
578#include "llvm/Frontend/OpenMP/OMPKinds.def"
579 default:
580 // Attributes are optional.
581 break;
582 }
583}
584
587 FunctionType *FnTy = nullptr;
588 Function *Fn = nullptr;
589
590 // Try to find the declaration in the module first.
591 switch (FnID) {
592#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
593 case Enum: \
594 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
595 IsVarArg); \
596 Fn = M.getFunction(Str); \
597 break;
598#include "llvm/Frontend/OpenMP/OMPKinds.def"
599 }
600
601 if (!Fn) {
602 // Create a new declaration if we need one.
603 switch (FnID) {
604#define OMP_RTL(Enum, Str, ...) \
605 case Enum: \
606 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
607 break;
608#include "llvm/Frontend/OpenMP/OMPKinds.def"
609 }
610
611 // Add information if the runtime function takes a callback function
612 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
613 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
614 LLVMContext &Ctx = Fn->getContext();
615 MDBuilder MDB(Ctx);
616 // Annotate the callback behavior of the runtime function:
617 // - The callback callee is argument number 2 (microtask).
618 // - The first two arguments of the callback callee are unknown (-1).
619 // - All variadic arguments to the runtime function are passed to the
620 // callback callee.
621 Fn->addMetadata(
622 LLVMContext::MD_callback,
624 2, {-1, -1}, /* VarArgsArePassed */ true)}));
625 }
626 }
627
628 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
629 << " with type " << *Fn->getFunctionType() << "\n");
630 addAttributes(FnID, *Fn);
631
632 } else {
633 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
634 << " with type " << *Fn->getFunctionType() << "\n");
635 }
636
637 assert(Fn && "Failed to create OpenMP runtime function");
638
639 return {FnTy, Fn};
640}
641
644 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
645 assert(Fn && "Failed to create OpenMP runtime function pointer");
646 return Fn;
647}
648
649void OpenMPIRBuilder::initialize() { initializeTypes(M); }
650
653 BasicBlock &EntryBlock = Function->getEntryBlock();
654 Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();
655
656 // Loop over blocks looking for constant allocas, skipping the entry block
657 // as any allocas there are already in the desired location.
658 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
659 Block++) {
660 for (auto Inst = Block->getReverseIterator()->begin();
661 Inst != Block->getReverseIterator()->end();) {
662 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
663 Inst++;
664 if (!isa<ConstantData>(AllocaInst->getArraySize()))
665 continue;
666 AllocaInst->moveBeforePreserving(MoveLocInst);
667 } else {
668 Inst++;
669 }
670 }
671 }
672}
673
675 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
677 SmallVector<OutlineInfo, 16> DeferredOutlines;
678 for (OutlineInfo &OI : OutlineInfos) {
679 // Skip functions that have not finalized yet; may happen with nested
680 // function generation.
681 if (Fn && OI.getFunction() != Fn) {
682 DeferredOutlines.push_back(OI);
683 continue;
684 }
685
686 ParallelRegionBlockSet.clear();
687 Blocks.clear();
688 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
689
690 Function *OuterFn = OI.getFunction();
691 CodeExtractorAnalysisCache CEAC(*OuterFn);
692 // If we generate code for the target device, we need to allocate
693 // struct for aggregate params in the device default alloca address space.
694 // OpenMP runtime requires that the params of the extracted functions are
695 // passed as zero address space pointers. This flag ensures that
696 // CodeExtractor generates correct code for extracted functions
697 // which are used by OpenMP runtime.
698 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
699 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
700 /* AggregateArgs */ true,
701 /* BlockFrequencyInfo */ nullptr,
702 /* BranchProbabilityInfo */ nullptr,
703 /* AssumptionCache */ nullptr,
704 /* AllowVarArgs */ true,
705 /* AllowAlloca */ true,
706 /* AllocaBlock*/ OI.OuterAllocaBB,
707 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
708
709 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
710 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
711 << " Exit: " << OI.ExitBB->getName() << "\n");
712 assert(Extractor.isEligible() &&
713 "Expected OpenMP outlining to be possible!");
714
715 for (auto *V : OI.ExcludeArgsFromAggregate)
716 Extractor.excludeArgFromAggregate(V);
717
718 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
719
720 // Forward target-cpu, target-features attributes to the outlined function.
721 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
722 if (TargetCpuAttr.isStringAttribute())
723 OutlinedFn->addFnAttr(TargetCpuAttr);
724
725 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
726 if (TargetFeaturesAttr.isStringAttribute())
727 OutlinedFn->addFnAttr(TargetFeaturesAttr);
728
729 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
730 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
731 assert(OutlinedFn->getReturnType()->isVoidTy() &&
732 "OpenMP outlined functions should not return a value!");
733
734 // For compatibility with the clang CG we move the outlined function after the
735 // one with the parallel region.
736 OutlinedFn->removeFromParent();
737 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
738
739 // Remove the artificial entry introduced by the extractor right away, we
740 // made our own entry block after all.
741 {
742 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
743 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
744 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
745 // Move instructions from the to-be-deleted ArtificialEntry to the entry
746 // basic block of the parallel region. CodeExtractor generates
747 // instructions to unwrap the aggregate argument and may sink
748 // allocas/bitcasts for values that are solely used in the outlined region
749 // and do not escape.
750 assert(!ArtificialEntry.empty() &&
751 "Expected instructions to add in the outlined region entry");
752 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
753 End = ArtificialEntry.rend();
754 It != End;) {
755 Instruction &I = *It;
756 It++;
757
758 if (I.isTerminator())
759 continue;
760
761 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
762 }
763
764 OI.EntryBB->moveBefore(&ArtificialEntry);
765 ArtificialEntry.eraseFromParent();
766 }
767 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
768 assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
769
770 // Run a user callback, e.g. to add attributes.
771 if (OI.PostOutlineCB)
772 OI.PostOutlineCB(*OutlinedFn);
773 }
774
775 // Remove work items that have been completed.
776 OutlineInfos = std::move(DeferredOutlines);
777
778 // The createTarget functions embeds user written code into
779 // the target region which may inject allocas which need to
780 // be moved to the entry block of our target or risk malformed
781 // optimisations by later passes, this is only relevant for
782 // the device pass which appears to be a little more delicate
783 // when it comes to optimisations (however, we do not block on
784 // that here, it's up to the inserter to the list to do so).
785 // This notably has to occur after the OutlinedInfo candidates
786 // have been extracted so we have an end product that will not
787 // be implicitly adversely affected by any raises unless
788 // intentionally appended to the list.
789 // NOTE: This only does so for ConstantData, it could be extended
790 // to ConstantExpr's with further effort, however, they should
791 // largely be folded when they get here. Extending it to runtime
792 // defined/read+writeable allocation sizes would be non-trivial
793 // (need to factor in movement of any stores to variables the
794 // allocation size depends on, as well as the usual loads,
795 // otherwise it'll yield the wrong result after movement) and
796 // likely be more suitable as an LLVM optimisation pass.
799
800 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
801 [](EmitMetadataErrorKind Kind,
802 const TargetRegionEntryInfo &EntryInfo) -> void {
803 errs() << "Error of kind: " << Kind
804 << " when emitting offload entries and metadata during "
805 "OMPIRBuilder finalization \n";
806 };
807
810
811 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
812 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
813 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
814 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
815 }
816}
817
819 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
820}
821
824 auto *GV =
825 new GlobalVariable(M, I32Ty,
826 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
827 ConstantInt::get(I32Ty, Value), Name);
828 GV->setVisibility(GlobalValue::HiddenVisibility);
829
830 return GV;
831}
832
834 uint32_t SrcLocStrSize,
835 IdentFlag LocFlags,
836 unsigned Reserve2Flags) {
837 // Enable "C-mode".
838 LocFlags |= OMP_IDENT_FLAG_KMPC;
839
840 Constant *&Ident =
841 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
842 if (!Ident) {
844 Constant *IdentData[] = {I32Null,
845 ConstantInt::get(Int32, uint32_t(LocFlags)),
846 ConstantInt::get(Int32, Reserve2Flags),
847 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
848 Constant *Initializer =
849 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
850
851 // Look for existing encoding of the location + flags, not needed but
852 // minimizes the difference to the existing solution while we transition.
853 for (GlobalVariable &GV : M.globals())
854 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
855 if (GV.getInitializer() == Initializer)
856 Ident = &GV;
857
858 if (!Ident) {
859 auto *GV = new GlobalVariable(
860 M, OpenMPIRBuilder::Ident,
861 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
864 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
865 GV->setAlignment(Align(8));
866 Ident = GV;
867 }
868 }
869
871}
872
874 uint32_t &SrcLocStrSize) {
875 SrcLocStrSize = LocStr.size();
876 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
877 if (!SrcLocStr) {
878 Constant *Initializer =
880
881 // Look for existing encoding of the location, not needed but minimizes the
882 // difference to the existing solution while we transition.
883 for (GlobalVariable &GV : M.globals())
884 if (GV.isConstant() && GV.hasInitializer() &&
885 GV.getInitializer() == Initializer)
886 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
887
888 SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
889 /* AddressSpace */ 0, &M);
890 }
891 return SrcLocStr;
892}
893
895 StringRef FileName,
896 unsigned Line, unsigned Column,
897 uint32_t &SrcLocStrSize) {
898 SmallString<128> Buffer;
899 Buffer.push_back(';');
900 Buffer.append(FileName);
901 Buffer.push_back(';');
902 Buffer.append(FunctionName);
903 Buffer.push_back(';');
904 Buffer.append(std::to_string(Line));
905 Buffer.push_back(';');
906 Buffer.append(std::to_string(Column));
907 Buffer.push_back(';');
908 Buffer.push_back(';');
909 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
910}
911
912Constant *
914 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
915 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
916}
917
919 uint32_t &SrcLocStrSize,
920 Function *F) {
921 DILocation *DIL = DL.get();
922 if (!DIL)
923 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
924 StringRef FileName = M.getName();
925 if (DIFile *DIF = DIL->getFile())
926 if (std::optional<StringRef> Source = DIF->getSource())
927 FileName = *Source;
928 StringRef Function = DIL->getScope()->getSubprogram()->getName();
929 if (Function.empty() && F)
930 Function = F->getName();
931 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
932 DIL->getColumn(), SrcLocStrSize);
933}
934
936 uint32_t &SrcLocStrSize) {
937 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
938 Loc.IP.getBlock()->getParent());
939}
940
942 return Builder.CreateCall(
943 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
944 "omp_global_thread_num");
945}
946
// Emits an OpenMP barrier at Loc and returns the insertion point after it.
//  * Kind selects the barrier flag stored in the ident (for / sections /
//    single / explicit barrier; any other directive maps to the generic
//    implicit-barrier flag).
//  * Unless ForceSimpleCall is set, a barrier inside a cancellable parallel
//    region uses __kmpc_cancel_barrier instead of __kmpc_barrier.
//  * When the cancel barrier is used and CheckCancelFlag is set, the call
//    result is passed to emitCancelationCheckImpl; an Error from that step is
//    returned to the caller.
// If updateToLocation(Loc) fails, Loc.IP is returned immediately.
// NOTE(review): listing lines 947-948 (function name and leading parameters)
// and 988 (start of the call expression assigned to Result) are absent from
// this extract.
949 bool ForceSimpleCall, bool CheckCancelFlag) {
950 if (!updateToLocation(Loc))
951 return Loc.IP;
952
953 // Build call __kmpc_cancel_barrier(loc, thread_id) or
954 // __kmpc_barrier(loc, thread_id);
955
// Translate the directive kind into the ident flag describing the barrier.
956 IdentFlag BarrierLocFlags;
957 switch (Kind) {
958 case OMPD_for:
959 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
960 break;
961 case OMPD_sections:
962 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
963 break;
964 case OMPD_single:
965 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
966 break;
967 case OMPD_barrier:
968 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
969 break;
970 default:
971 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
972 break;
973 }
974
// The barrier call gets an ident carrying the barrier flag, while the thread
// id is obtained through an ident created without that flag.
975 uint32_t SrcLocStrSize;
976 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
977 Value *Args[] = {
978 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
979 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
980
981 // If we are in a cancellable parallel region, barriers are cancellation
982 // points.
983 // TODO: Check why we would force simple calls or to ignore the cancel flag.
984 bool UseCancelBarrier =
985 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
986
987 Value *Result =
989 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
990 : OMPRTL___kmpc_barrier),
991 Args);
992
// Only a cancel barrier produces a flag worth checking.
993 if (UseCancelBarrier && CheckCancelFlag)
994 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
995 return Err;
996
997 return Builder.saveIP();
998}
999
// Emits a call to __kmpc_cancel for CanceledDirective at Loc and returns the
// insertion point after the generated code.
//  * When IfCondition is non-null the current block is split into then/else
//    arms and the cancel call is emitted in the "then" arm.
//  * The cancel-kind constant is looked up through the OMP_CANCEL_KIND entries
//    of OMPKinds.def; any other directive reaches llvm_unreachable.
//  * The call result is passed to emitCancelationCheckImpl together with
//    ExitCB, which performs extra work only when the cancelled directive is
//    OMPD_parallel.
// A temporary unreachable instruction serves as the block terminator while the
// code is built and is erased before the final insertion point is returned.
// If updateToLocation(Loc) fails, Loc.IP is returned immediately.
// NOTE(review): listing lines 1000-1001 (function name and leading
// parameters) and 1034 / 1036 (inside ExitCB) are absent from this extract.
1002 Value *IfCondition,
1003 omp::Directive CanceledDirective) {
1004 if (!updateToLocation(Loc))
1005 return Loc.IP;
1006
1007 // LLVM utilities like blocks with terminators.
1008 auto *UI = Builder.CreateUnreachable();
1009
// Without an if-condition the cancel call is emitted right before UI.
1010 Instruction *ThenTI = UI, *ElseTI = nullptr;
1011 if (IfCondition)
1012 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1013 Builder.SetInsertPoint(ThenTI);
1014
1015 Value *CancelKind = nullptr;
1016 switch (CanceledDirective) {
1017#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1018 case DirectiveEnum: \
1019 CancelKind = Builder.getInt32(Value); \
1020 break;
1021#include "llvm/Frontend/OpenMP/OMPKinds.def"
1022 default:
1023 llvm_unreachable("Unknown cancel kind!");
1024 }
1025
1026 uint32_t SrcLocStrSize;
1027 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1028 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1029 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1030 Value *Result = Builder.CreateCall(
1031 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
// Callback handed to the cancellation check. It restores the given insertion
// point only for a cancelled parallel directive; the visible arguments
// (OMPD_unknown, ForceSimpleCall=false, CheckCancelFlag=false) belong to a
// call whose first line is missing from this extract - presumably a barrier.
1032 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1033 if (CanceledDirective == OMPD_parallel) {
1035 Builder.restoreIP(IP);
1037 omp::Directive::OMPD_unknown,
1038 /* ForceSimpleCall */ false,
1039 /* CheckCancelFlag */ false)
1040 .takeError();
1041 }
1042 return Error::success();
1043 };
1044
1045 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1046 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1047 return Err;
1048
1049 // Update the insertion point and remove the terminator we introduced.
1050 Builder.SetInsertPoint(UI->getParent());
1051 UI->eraseFromParent();
1052
1053 return Builder.saveIP();
1054}
1055
// Emits the __tgt_target_kernel runtime call and returns the insertion point
// after it.
//  * A "kernel_args" alloca of type OpenMPIRBuilder::KernelArgs is created at
//    AllocaIP.
//  * For every entry of KernelArgs a struct-field address with the same index
//    is computed; the visible arguments (value, address, preferred type
//    alignment) belong to a call whose first line is missing from this
//    extract - presumably an aligned store.
//  * The runtime call receives Ident, DeviceID, NumTeams, NumThreads, HostPtr
//    and the alloca; the call is handed back through the Return reference.
// If updateToLocation(Loc) fails, Loc.IP is returned immediately.
// NOTE(review): the loop indexes struct fields by position, so KernelArgs is
// assumed not to hold more entries than the struct has fields - confirm.
// NOTE(review): listing lines 1056 (function name) and 1071 (start of the
// store call) are absent from this extract.
1057 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1058 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1059 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1060 if (!updateToLocation(Loc))
1061 return Loc.IP;
1062
// Create the argument struct at the alloca point, then return to Loc.IP.
1063 Builder.restoreIP(AllocaIP);
1064 auto *KernelArgsPtr =
1065 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1066 Builder.restoreIP(Loc.IP);
1067
1068 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1069 llvm::Value *Arg =
1070 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1072 KernelArgs[I], Arg,
1073 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1074 }
1075
1076 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1077 NumThreads, HostPtr, KernelArgsPtr};
1078
1079 Return = Builder.CreateCall(
1080 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1081 OffloadingArgs);
1082
1083 return Builder.saveIP();
1084}
1085
// Emits the launch of a target kernel followed by a host fallback path and
// returns the insertion point in the continuation block.
// Visible steps:
//  * Collect the kernel arguments from Args via getKernelArgsVector.
//  * Issue the offloading call (its first line is missing from this extract;
//    the visible arguments are Builder, AllocaIP, Return, RTLoc, DeviceID, the
//    first NumTeams / NumThreads entries, OutlinedFnID and the argument
//    vector).
//  * Branch on `Failed` to "omp_offload.failed", where
//    EmitTargetCallFallbackCB generates the fallback, or directly to
//    "omp_offload.cont".
// An Error produced by the fallback callback is returned to the caller. If
// updateToLocation(Loc) fails, Loc.IP is returned immediately.
// NOTE(review): Args.NumTeams.front() and Args.NumThreads.front() are read
// without an emptiness check - confirm callers always supply one entry.
// NOTE(review): listing lines 1086 (function name), 1128 (start of the
// offloading call) and 1136 (definition of `Failed`) are absent from this
// extract.
1087 const LocationDescription &Loc, Value *OutlinedFnID,
1088 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1089 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1090
1091 if (!updateToLocation(Loc))
1092 return Loc.IP;
1093
1094 Builder.restoreIP(Loc.IP);
1095 // On top of the arrays that were filled up, the target offloading call
1096 // takes as arguments the device id as well as the host pointer. The host
1097 // pointer is used by the runtime library to identify the current target
1098 // region, so it only has to be unique and not necessarily point to
1099 // anything. It could be the pointer to the outlined function that
1100 // implements the target region, but we aren't using that so that the
1101 // compiler doesn't need to keep that, and could therefore inline the host
1102 // function if proven worthwhile during optimization.
1103
1104 // From this point on, we need to have an ID of the target region defined.
1105 assert(OutlinedFnID && "Invalid outlined function ID!");
1106 (void)OutlinedFnID;
1107
1108 // Return value of the runtime offloading call.
1109 Value *Return = nullptr;
1110
1111 // Arguments for the target kernel.
1112 SmallVector<Value *> ArgsVector;
1113 getKernelArgsVector(Args, Builder, ArgsVector);
1114
1115 // The target region is an outlined function launched by the runtime
1116 // via calls to __tgt_target_kernel().
1117 //
1118 // Note that on the host and CPU targets, the runtime implementation of
1119 // these calls simply call the outlined function without forking threads.
1120 // The outlined functions themselves have runtime calls to
1121 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1122 // the compiler in emitTeamsCall() and emitParallelCall().
1123 //
1124 // In contrast, on the NVPTX target, the implementation of
1125 // __tgt_target_teams() launches a GPU kernel with the requested number
1126 // of teams and threads so no additional calls to the runtime are required.
1127 // Check the error code and execute the host version if required.
// NOTE(review): the paragraph above names __tgt_target_teams() while the
// first paragraph refers to __tgt_target_kernel() - confirm which is meant.
1129 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1130 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1131
// Both blocks are created detached and inserted into the function later via
// emitBlock.
1132 BasicBlock *OffloadFailedBlock =
1133 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1134 BasicBlock *OffloadContBlock =
1135 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1137 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1138
1139 auto CurFn = Builder.GetInsertBlock()->getParent();
1140 emitBlock(OffloadFailedBlock, CurFn);
1141 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1142 if (!AfterIP)
1143 return AfterIP.takeError();
1144 Builder.restoreIP(*AfterIP);
1145 emitBranch(OffloadContBlock);
1146 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1147 return Builder.saveIP();
1148}
1149
// Emits the control flow that reacts to a cancellation flag.
//  * CancelFlag is tested for null: a null flag continues in a ".cont" block,
//    a non-null flag jumps to a new ".cncl" block.
//  * In the ".cncl" block the optional ExitCB runs first, followed by the
//    FiniCB of the innermost FinalizationStack entry.
//  * On success the builder is left at the beginning of the continuation
//    block.
// An Error from either callback is returned. The assertion requires that the
// innermost finalization info is cancellable for CanceledDirective.
// NOTE(review): listing lines 1150 (function name), 1157 (definition of BB)
// and 1166-1167 (inside the else branch) are absent from this extract.
1151 Value *CancelFlag, omp::Directive CanceledDirective,
1152 FinalizeCallbackTy ExitCB) {
1153 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1154 "Unexpected cancellation!");
1155
1156 // For a cancel barrier we create two new blocks.
1158 BasicBlock *NonCancellationBlock;
1159 if (Builder.GetInsertPoint() == BB->end()) {
1160 // TODO: This branch will not be needed once we moved to the
1161 // OpenMPIRBuilder codegen completely.
1162 NonCancellationBlock = BasicBlock::Create(
1163 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1164 } else {
// The builder sits in the middle of BB: split so that the instructions after
// the insertion point form the continuation block.
1165 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1168 }
1169 BasicBlock *CancellationBlock = BasicBlock::Create(
1170 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1171
1172 // Jump to them based on the return value.
1173 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1174 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1175 /* TODO weight */ nullptr, nullptr);
1176
1177 // From the cancellation block we finalize all variables and go to the
1178 // post finalization block that is known to the FiniCB callback.
1179 Builder.SetInsertPoint(CancellationBlock);
1180 if (ExitCB)
1181 if (Error Err = ExitCB(Builder.saveIP()))
1182 return Err;
1183 auto &FI = FinalizationStack.back();
1184 if (Error Err = FI.FiniCB(Builder.saveIP()))
1185 return Err;
1186
1187 // The continuation block is where code generation continues.
1188 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1189 return Error::success();
1190}
1191
1192// Callback used to create OpenMP runtime calls to support
1193// omp parallel clause for the device.
1194// We need to use this callback to replace call to the OutlinedFn in OuterFn
1195// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
//
// Visible steps:
//  * Mark the first two parameters of OutlinedFn noalias/noundef and the
//    function nounwind.
//  * Allocate an array of pointers in OuterAllocaBB (one slot per captured
//    variable) and store the call's trailing arguments into it.
//  * Emit the __kmpc_parallel_51 call; the if-expression defaults to 1, the
//    thread count to -1 when NumThreads is null, proc-bind is always -1 and
//    the wrapper function is a null pointer.
//  * Initialize PrivTIDAddr from the first outlined-function argument, then
//    erase the original call and every instruction in ToBeDeleted.
// NOTE(review): listing line 1196 (return type and function name) is absent
// from this extract.
1197 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1198 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1199 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1200 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1201 // Add some known attributes.
1202 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1203 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1204 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1205 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1206 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1207 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1208
// NOTE(review): this check runs after parameters 0 and 1 were already
// annotated above; consider whether it should come first.
1209 assert(OutlinedFn.arg_size() >= 2 &&
1210 "Expected at least tid and bounded tid as arguments");
1211 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1212
// NOTE(review): assumes the last user of OutlinedFn is its (single) call
// site. The assert below looks vacuous if cast<> already rejects
// mismatches - confirm.
1213 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1214 assert(CI && "Expected call instruction to outlined function");
1215 CI->getParent()->setName("omp_parallel");
1216
1217 Builder.SetInsertPoint(CI);
1218 Type *PtrTy = OMPIRBuilder->VoidPtr;
1219 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1220
1221 // Add alloca for kernel args
1222 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1223 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1224 AllocaInst *ArgsAlloca =
1225 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1226 Value *Args = ArgsAlloca;
1227 // Add address space cast if array for storing arguments is not allocated
1228 // in address space 0
1229 if (ArgsAlloca->getAddressSpace())
1230 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1231 Builder.restoreIP(CurrentIP);
1232
1233 // Store captured vars which are used by kmpc_parallel_51
1234 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
// Skip the two leading (tid, bounded tid) call arguments.
1235 Value *V = *(CI->arg_begin() + 2 + Idx);
1236 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1237 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1238 Builder.CreateStore(V, StoreAddress);
1239 }
1240
1241 Value *Cond =
1242 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1243 : Builder.getInt32(1);
1244
1245 // Build kmpc_parallel_51 call
1246 Value *Parallel51CallArgs[] = {
1247 /* identifier*/ Ident,
1248 /* global thread num*/ ThreadID,
1249 /* if expression */ Cond,
1250 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1251 /* Proc bind */ Builder.getInt32(-1),
1252 /* outlined function */
1253 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
1254 /* wrapper function */ NullPtrValue,
1255 /* arguments of the outlined function*/ Args,
1256 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1257
1258 FunctionCallee RTLFn =
1259 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1260
1261 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1262
1263 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1264 << *Builder.GetInsertBlock()->getParent() << "\n");
1265
1266 // Initialize the local TID stack location with the argument value.
1267 Builder.SetInsertPoint(PrivTID);
1268 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1269 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1270 PrivTIDAddr);
1271
1272 // Remove redundant call to the outlined function.
1273 CI->eraseFromParent();
1274
// Drop the helper instructions collected by the caller.
1275 for (Instruction *I : ToBeDeleted) {
1276 I->eraseFromParent();
1277 }
1278}
1279
1280// Callback used to create OpenMP runtime calls to support
1281// omp parallel clause for the host.
1282// We need to use this callback to replace call to the OutlinedFn in OuterFn
1283// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
//
// Visible steps:
//  * Pick __kmpc_fork_call_if when IfCondition is non-null, otherwise
//    __kmpc_fork_call, and attach !callback metadata to the runtime function
//    if it has none yet.
//  * Mark the first two parameters of OutlinedFn noalias and the function
//    nounwind.
//  * Build the runtime call from Ident, the captured-variable count, the
//    outlined function, the optional condition and the original call's
//    trailing arguments.
//  * Initialize PrivTIDAddr from the first outlined-function argument, then
//    erase the original call and every instruction in ToBeDeleted.
// NOTE(review): listing lines 1285 (function name and leading parameters) and
// 1308 (start of the metadata node expression) are absent from this extract.
1284static void
1286 Function *OuterFn, Value *Ident, Value *IfCondition,
1287 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1288 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1289 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1290 FunctionCallee RTLFn;
1291 if (IfCondition) {
1292 RTLFn =
1293 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1294 } else {
1295 RTLFn =
1296 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1297 }
1298 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1299 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1300 LLVMContext &Ctx = F->getContext();
1301 MDBuilder MDB(Ctx);
1302 // Annotate the callback behavior of the __kmpc_fork_call:
1303 // - The callback callee is argument number 2 (microtask).
1304 // - The first two arguments of the callback callee are unknown (-1).
1305 // - All variadic arguments to the __kmpc_fork_call are passed to the
1306 // callback callee.
1307 F->addMetadata(LLVMContext::MD_callback,
1309 2, {-1, -1},
1310 /* VarArgsArePassed */ true)}));
1311 }
1312 }
1313 // Add some known attributes.
1314 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1315 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1316 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1317
1318 assert(OutlinedFn.arg_size() >= 2 &&
1319 "Expected at least tid and bounded tid as arguments");
1320 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1321
// NOTE(review): assumes the last user of OutlinedFn is its (single) call
// site - confirm.
1322 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1323 CI->getParent()->setName("omp_parallel");
1324 Builder.SetInsertPoint(CI);
1325
1326 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1327 Value *ForkCallArgs[] = {
1328 Ident, Builder.getInt32(NumCapturedVars),
1329 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
1330
1331 SmallVector<Value *, 16> RealArgs;
1332 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
// The condition, when present, goes between the microtask and the captured
// variables.
1333 if (IfCondition) {
1334 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1335 RealArgs.push_back(Cond);
1336 }
1337 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1338
1339 // __kmpc_fork_call_if always expects a void ptr as the last argument
1340 // If there are no arguments, pass a null pointer.
1341 auto PtrTy = OMPIRBuilder->VoidPtr;
1342 if (IfCondition && NumCapturedVars == 0) {
1343 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1344 RealArgs.push_back(NullPtrValue);
1345 }
1346 if (IfCondition && RealArgs.back()->getType() != PtrTy)
1347 RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
1348
1349 Builder.CreateCall(RTLFn, RealArgs);
1350
1351 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1352 << *Builder.GetInsertBlock()->getParent() << "\n");
1353
1354 // Initialize the local TID stack location with the argument value.
1355 Builder.SetInsertPoint(PrivTID);
1356 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1357 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1358 PrivTIDAddr);
1359
1360 // Remove redundant call to the outlined function.
1361 CI->eraseFromParent();
1362
// Drop the helper instructions collected by the caller.
1363 for (Instruction *I : ToBeDeleted) {
1364 I->eraseFromParent();
1365 }
1366}
1367
// Builds an OpenMP parallel region at Loc and returns the insertion point
// that follows it.
// Visible steps:
//  1. Create the ident and thread id. When not compiling for a target device
//     and NumThreads is given, pass it to __kmpc_push_num_threads; when
//     ProcBind is not the default, pass it to __kmpc_push_proc_bind.
//  2. Create temporary "tid.addr" / "zero.addr" allocas in the outer alloca
//     block (plus address-space casts on target devices whose alloca address
//     space is non-zero). They only model the leading arguments of the
//     outlined function and are queued in ToBeDeleted.
//  3. Split the insertion block into omp.par.entry, omp.par.region,
//     omp.par.pre_finalize and omp.par.exit, and push a finalization entry
//     that wraps FiniCB.
//  4. Invoke BodyGenCB, collect the region's blocks, and use a CodeExtractor
//     to determine the region's inputs.
//  5. Run PrivHelper on every input: non-pointer values are passed through a
//     stack slot, and uses inside the region are rewritten to the replacement
//     supplied by PrivCB (or to PrivTID for the thread-id call).
//  6. Pop the finalization entry, call FiniCB once at the pre-finalize block,
//     and register the OutlineInfo whose PostOutlineCB later emits the
//     runtime call (targetParallelCallback or hostParallelCallback).
// Errors from BodyGenCB, PrivCB and FiniCB are returned to the caller. If
// updateToLocation(Loc) fails, Loc.IP is returned immediately.
// NOTE(review): listing lines 1368 (function name), 1395, 1404, 1417
// (declaration of ToBeDeleted), 1461, 1537 (declaration of Blocks), 1585
// (declaration of Uses) and 1599 are absent from this extract.
1369 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1370 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1371 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1372 omp::ProcBindKind ProcBind, bool IsCancellable) {
1373 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1374
1375 if (!updateToLocation(Loc))
1376 return Loc.IP;
1377
1378 uint32_t SrcLocStrSize;
1379 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1380 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1381 Value *ThreadID = getOrCreateThreadID(Ident);
1382 // If we generate code for the target device, we need to allocate
1383 // struct for aggregate params in the device default alloca address space.
1384 // OpenMP runtime requires that the params of the extracted functions are
1385 // passed as zero address space pointers. This flag ensures that extracted
1386 // function arguments are declared in zero address space
1387 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1388
1389 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1390 // only if we compile for host side.
1391 if (NumThreads && !Config.isTargetDevice()) {
1392 Value *Args[] = {
1393 Ident, ThreadID,
1394 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1396 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1397 }
1398
1399 if (ProcBind != OMP_PROC_BIND_default) {
1400 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1401 Value *Args[] = {
1402 Ident, ThreadID,
1403 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1405 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1406 }
1407
1408 BasicBlock *InsertBB = Builder.GetInsertBlock();
1409 Function *OuterFn = InsertBB->getParent();
1410
1411 // Save the outer alloca block because the insertion iterator may get
1412 // invalidated and we still need this later.
1413 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1414
1415 // Vector to remember instructions we used only during the modeling but which
1416 // we want to delete at the end.
1418
1419 // Change the location to the outer alloca insertion point to create and
1420 // initialize the allocas we pass into the parallel region.
1421 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1422 Builder.restoreIP(NewOuter);
1423 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1424 AllocaInst *ZeroAddrAlloca =
1425 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1426 Instruction *TIDAddr = TIDAddrAlloca;
1427 Instruction *ZeroAddr = ZeroAddrAlloca;
1428 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1429 // Add additional casts to enforce pointers in zero address space
1430 TIDAddr = new AddrSpaceCastInst(
1431 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1432 TIDAddr->insertAfter(TIDAddrAlloca);
1433 ToBeDeleted.push_back(TIDAddr);
1434 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1435 PointerType ::get(M.getContext(), 0),
1436 "zero.addr.ascast");
1437 ZeroAddr->insertAfter(ZeroAddrAlloca);
1438 ToBeDeleted.push_back(ZeroAddr);
1439 }
1440
1441 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1442 // associated arguments in the outlined function, so we delete them later.
1443 ToBeDeleted.push_back(TIDAddrAlloca);
1444 ToBeDeleted.push_back(ZeroAddrAlloca);
1445
1446 // Create an artificial insertion point that will also ensure the blocks we
1447 // are about to split are not degenerated.
1448 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1449
// Every split happens at UI, so UI always ends up in the last block created.
1450 BasicBlock *EntryBB = UI->getParent();
1451 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1452 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1453 BasicBlock *PRegPreFiniBB =
1454 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1455 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1456
1457 auto FiniCBWrapper = [&](InsertPointTy IP) {
1458 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1459 // target to the region exit block.
1460 if (IP.getBlock()->end() == IP.getPoint()) {
1462 Builder.restoreIP(IP);
1463 Instruction *I = Builder.CreateBr(PRegExitBB);
1464 IP = InsertPointTy(I->getParent(), I->getIterator());
1465 }
1466 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1467 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1468 "Unexpected insertion point for finalization call!");
1469 return FiniCB(IP);
1470 };
1471
1472 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1473
1474 // Generate the privatization allocas in the block that will become the entry
1475 // of the outlined function.
1476 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1477 InsertPointTy InnerAllocaIP = Builder.saveIP();
1478
1479 AllocaInst *PrivTIDAddr =
1480 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1481 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1482
1483 // Add some fake uses for OpenMP provided arguments.
1484 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1485 Instruction *ZeroAddrUse =
1486 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1487 ToBeDeleted.push_back(ZeroAddrUse);
1488
1489 // EntryBB
1490 // |
1491 // V
1492 // PRegionEntryBB <- Privatization allocas are placed here.
1493 // |
1494 // V
1495 // PRegionBodyBB <- BodyGen is invoked here.
1496 // |
1497 // V
1498 // PRegPreFiniBB <- The block we will start finalization from.
1499 // |
1500 // V
1501 // PRegionExitBB <- A common exit to simplify block collection.
1502 //
1503
1504 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1505
1506 // Let the caller create the body.
1507 assert(BodyGenCB && "Expected body generation callback!");
1508 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1509 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1510 return Err;
1511
1512 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1513
// The runtime call is emitted later through PostOutlineCB; the collected
// helper instructions are moved into the callback, which erases them.
1514 OutlineInfo OI;
1515 if (Config.isTargetDevice()) {
1516 // Generate OpenMP target specific runtime call
1517 OI.PostOutlineCB = [=, ToBeDeletedVec =
1518 std::move(ToBeDeleted)](Function &OutlinedFn) {
1519 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1520 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1521 ThreadID, ToBeDeletedVec);
1522 };
1523 } else {
1524 // Generate OpenMP host runtime call
1525 OI.PostOutlineCB = [=, ToBeDeletedVec =
1526 std::move(ToBeDeleted)](Function &OutlinedFn) {
1527 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1528 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1529 };
1530 }
1531
1532 OI.OuterAllocaBB = OuterAllocaBlock;
1533 OI.EntryBB = PRegEntryBB;
1534 OI.ExitBB = PRegExitBB;
1535
1536 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1538 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1539
1540 // Ensure a single exit node for the outlined region by creating one.
1541 // We might have multiple incoming edges to the exit now due to finalizations,
1542 // e.g., cancel calls that cause the control flow to leave the region.
1543 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1544 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1545 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1546 Blocks.push_back(PRegOutlinedExitBB);
1547
1548 CodeExtractorAnalysisCache CEAC(*OuterFn);
1549 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1550 /* AggregateArgs */ false,
1551 /* BlockFrequencyInfo */ nullptr,
1552 /* BranchProbabilityInfo */ nullptr,
1553 /* AssumptionCache */ nullptr,
1554 /* AllowVarArgs */ true,
1555 /* AllowAlloca */ true,
1556 /* AllocationBlock */ OuterAllocaBlock,
1557 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1558
1559 // Find inputs to, outputs from the code region.
1560 BasicBlock *CommonExit = nullptr;
1561 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1562 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1563
1564 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1565 /*CollectGlobalInputs=*/true);
1566
// Global variables whose value type is the ident struct are not treated as
// region inputs.
1567 Inputs.remove_if([&](Value *I) {
1568 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1569 return GV->getValueType() == OpenMPIRBuilder::Ident;
1570
1571 return false;
1572 });
1573
1574 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1575
1576 FunctionCallee TIDRTLFn =
1577 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1578
// Privatizes one region input V: collects V's uses inside the region and
// replaces them with the value chosen below.
1579 auto PrivHelper = [&](Value &V) -> Error {
1580 if (&V == TIDAddr || &V == ZeroAddr) {
1581 OI.ExcludeArgsFromAggregate.push_back(&V);
1582 return Error::success();
1583 }
1584
1586 for (Use &U : V.uses())
1587 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1588 if (ParallelRegionBlockSet.count(UserI->getParent()))
1589 Uses.insert(&U);
1590
1591 // __kmpc_fork_call expects extra arguments as pointers. If the input
1592 // already has a pointer type, everything is fine. Otherwise, store the
1593 // value onto stack and load it back inside the to-be-outlined region. This
1594 // will ensure only the pointer will be passed to the function.
1595 // FIXME: if there are more than 15 trailing arguments, they must be
1596 // additionally packed in a struct.
1597 Value *Inner = &V;
1598 if (!V.getType()->isPointerTy()) {
1600 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1601
1602 Builder.restoreIP(OuterAllocaIP);
1603 Value *Ptr =
1604 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1605
1606 // Store to stack at end of the block that currently branches to the entry
1607 // block of the to-be-outlined region.
1608 Builder.SetInsertPoint(InsertBB,
1609 InsertBB->getTerminator()->getIterator());
1610 Builder.CreateStore(&V, Ptr);
1611
1612 // Load back next to allocations in the to-be-outlined region.
1613 Builder.restoreIP(InnerAllocaIP);
1614 Inner = Builder.CreateLoad(V.getType(), Ptr);
1615 }
1616
// A call to the thread-id runtime function is replaced by the region-local
// thread id; every other input goes through the privatization callback.
1617 Value *ReplacementValue = nullptr;
1618 CallInst *CI = dyn_cast<CallInst>(&V);
1619 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1620 ReplacementValue = PrivTID;
1621 } else {
1622 InsertPointOrErrorTy AfterIP =
1623 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1624 if (!AfterIP)
1625 return AfterIP.takeError();
1626 Builder.restoreIP(*AfterIP);
1627 InnerAllocaIP = {
1628 InnerAllocaIP.getBlock(),
1629 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1630
1631 assert(ReplacementValue &&
1632 "Expected copy/create callback to set replacement value!");
// The callback kept the original value, so there is nothing to rewrite.
1633 if (ReplacementValue == &V)
1634 return Error::success();
1635 }
1636
1637 for (Use *UPtr : Uses)
1638 UPtr->set(ReplacementValue);
1639
1640 return Error::success();
1641 };
1642
1643 // Reset the inner alloca insertion as it will be used for loading the values
1644 // wrapped into pointers before passing them into the to-be-outlined region.
1645 // Configure it to insert immediately after the fake use of zero address so
1646 // that they are available in the generated body and so that the
1647 // OpenMP-related values (thread ID and zero address pointers) remain leading
1648 // in the argument list.
1649 InnerAllocaIP = IRBuilder<>::InsertPoint(
1650 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1651
1652 // Reset the outer alloca insertion point to the entry of the relevant block
1653 // in case it was invalidated.
1654 OuterAllocaIP = IRBuilder<>::InsertPoint(
1655 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1656
1657 for (Value *Input : Inputs) {
1658 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1659 if (Error Err = PrivHelper(*Input))
1660 return Err;
1661 }
// NOTE(review): LLVM_DEBUG is nested inside LLVM_DEBUG below; the inner use
// looks redundant.
1662 LLVM_DEBUG({
1663 for (Value *Output : Outputs)
1664 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1665 });
1666 assert(Outputs.empty() &&
1667 "OpenMP outlining should not produce live-out values!");
1668
1669 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1670 LLVM_DEBUG({
1671 for (auto *BB : Blocks)
1672 dbgs() << " PBR: " << BB->getName() << "\n";
1673 });
1674
1675 // Adjust the finalization stack, verify the adjustment, and call the
1676 // finalize function a last time to finalize values between the pre-fini
1677 // block and the exit block if we left the parallel "the normal way".
1678 auto FiniInfo = FinalizationStack.pop_back_val();
1679 (void)FiniInfo;
1680 assert(FiniInfo.DK == OMPD_parallel &&
1681 "Unexpected finalization stack state!");
1682
1683 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1684
// This final call uses FiniCB directly rather than FiniCBWrapper.
1685 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1686 if (Error Err = FiniCB(PreFiniIP))
1687 return Err;
1688
1689 // Register the outlined info.
1690 addOutlineInfo(std::move(OI));
1691
// Compute the resulting insertion point (end of UI's block) before removing
// the artificial terminator.
1692 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1693 UI->eraseFromParent();
1694
1695 return AfterIP;
1696}
1697
// Emits a call to the __kmpc_flush runtime function at the builder's current
// position; its only argument is the ident created for Loc.
// NOTE(review): listing line 1698 (the function signature) is absent from
// this extract.
1699 // Build call void __kmpc_flush(ident_t *loc)
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1703
1704 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1705}
1706
// Moves the builder to Loc and emits a flush via emitFlush; does nothing when
// updateToLocation(Loc) fails.
// NOTE(review): listing line 1707 (the function signature) is absent from
// this extract.
1708 if (!updateToLocation(Loc))
1709 return;
1710 emitFlush(Loc);
1711}
1712
// Emits a call to the __kmpc_omp_taskwait runtime function at the builder's
// current position, passing the ident created for Loc and the thread id
// derived from that ident. The call's result is discarded.
// NOTE(review): listing line 1713 (the function signature) is absent from
// this extract.
1714 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1715 // global_tid);
1716 uint32_t SrcLocStrSize;
1717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1719 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1720
1721 // Ignore return result until untied tasks are supported.
1722 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1723 Args);
1724}
1725
// Moves the builder to Loc and emits a taskwait via emitTaskwaitImpl; does
// nothing when updateToLocation(Loc) fails.
// NOTE(review): listing line 1726 (the function signature) is absent from
// this extract.
1727 if (!updateToLocation(Loc))
1728 return;
1729 emitTaskwaitImpl(Loc);
1730}
1731
// Emits a call to the __kmpc_omp_taskyield runtime function at the builder's
// current position, passing the ident created for Loc, the thread id derived
// from that ident, and I32Null.
// NOTE(review): listing lines 1732 (the function signature) and 1737 (the
// definition of I32Null, presumably a 32-bit zero per the comment below) are
// absent from this extract.
1733 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1734 uint32_t SrcLocStrSize;
1735 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1736 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1738 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1739
1740 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1741 Args);
1742}
1743
// Moves the builder to Loc and emits a taskyield via emitTaskyieldImpl; does
// nothing when updateToLocation(Loc) fails.
// NOTE(review): listing line 1744 (the function signature) is absent from
// this extract.
1745 if (!updateToLocation(Loc))
1746 return;
1747 emitTaskyieldImpl(Loc);
1748}
1749
1750// Processes the dependencies in Dependencies and does the following
1751// - Allocates space on the stack of an array of DependInfo objects
1752// - Populates each DependInfo object with relevant information of
1753// the corresponding dependence.
1754// - All code is inserted in the entry block of the current function.
// Returns the alloca holding the array, or nullptr when Dependencies is
// empty. The builder's insertion point is saved on entry and restored before
// returning.
// NOTE(review): listing lines 1755 (return type and function name), 1757
// (the Dependencies parameter) and 1785 (the argument of SetInsertPoint) are
// absent from this extract.
1756 OpenMPIRBuilder &OMPBuilder,
1758 // Early return if we have no dependencies to process
1759 if (Dependencies.empty())
1760 return nullptr;
1761
1762 // Given a vector of DependData objects, in this function we create an
1763 // array on the stack that holds kmp_dep_info objects corresponding
1764 // to each dependency. This is then passed to the OpenMP runtime.
1765 // For example, if there are 'n' dependencies then the following pseudo
1766 // code is generated. Assume the first dependence is on a variable 'a'
1767 //
1768 // \code{c}
1769 // DepArray = alloc(n x sizeof(kmp_depend_info));
1770 // idx = 0;
1771 // DepArray[idx].base_addr = ptrtoint(&a);
1772 // DepArray[idx].len = 8;
1773 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1774 // ++idx;
1775 // DepArray[idx].base_addr = ...;
1776 // \endcode
1777
1778 IRBuilderBase &Builder = OMPBuilder.Builder;
1779 Type *DependInfo = OMPBuilder.DependInfo;
1780 Module &M = OMPBuilder.M;
1781
1782 Value *DepArray = nullptr;
1783 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1784 Builder.SetInsertPoint(
1786
1787 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1788 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1789
1790 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
// Address of the DepIdx-th element of the dependency array.
1791 Value *Base =
1792 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1793 // Store the pointer to the variable
1794 Value *Addr = Builder.CreateStructGEP(
1795 DependInfo, Base,
1796 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1797 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1798 Builder.CreateStore(DepValPtr, Addr);
1799 // Store the size of the variable
1800 Value *Size = Builder.CreateStructGEP(
1801 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1802 Builder.CreateStore(
1803 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1804 Size);
1805 // Store the dependency kind
1806 Value *Flags = Builder.CreateStructGEP(
1807 DependInfo, Base,
1808 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1809 Builder.CreateStore(
1810 ConstantInt::get(Builder.getInt8Ty(),
1811 static_cast<unsigned int>(Dep.DepKind)),
1812 Flags);
1813 }
1814 Builder.restoreIP(OldIP);
1815 return DepArray;
1816}
1817
1819 const LocationDescription &Loc, InsertPointTy AllocaIP,
1820 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1821 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle) {
1822
1823 if (!updateToLocation(Loc))
1824 return InsertPointTy();
1825
1826 uint32_t SrcLocStrSize;
1827 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1828 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1829 // The current basic block is split into four basic blocks. After outlining,
1830 // they will be mapped as follows:
1831 // ```
1832 // def current_fn() {
1833 // current_basic_block:
1834 // br label %task.exit
1835 // task.exit:
1836 // ; instructions after task
1837 // }
1838 // def outlined_fn() {
1839 // task.alloca:
1840 // br label %task.body
1841 // task.body:
1842 // ret void
1843 // }
1844 // ```
1845 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1846 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1847 BasicBlock *TaskAllocaBB =
1848 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1849
1850 InsertPointTy TaskAllocaIP =
1851 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1852 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1853 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1854 return Err;
1855
1856 OutlineInfo OI;
1857 OI.EntryBB = TaskAllocaBB;
1858 OI.OuterAllocaBB = AllocaIP.getBlock();
1859 OI.ExitBB = TaskExitBB;
1860
1861 // Add the thread ID argument.
1864 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1865
1866 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1867 Mergeable, EventHandle, TaskAllocaBB,
1868 ToBeDeleted](Function &OutlinedFn) mutable {
1869 // Replace the Stale CI by appropriate RTL function call.
1870 assert(OutlinedFn.getNumUses() == 1 &&
1871 "there must be a single user for the outlined function");
1872 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1873
1874 // HasShareds is true if any variables are captured in the outlined region,
1875 // false otherwise.
1876 bool HasShareds = StaleCI->arg_size() > 1;
1877 Builder.SetInsertPoint(StaleCI);
1878
1879 // Gather the arguments for emitting the runtime call for
1880 // @__kmpc_omp_task_alloc
1881 Function *TaskAllocFn =
1882 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1883
1884 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) of the runtime
1885 // call.
1886 Value *ThreadID = getOrCreateThreadID(Ident);
1887
1888 // Argument - `flags`
1889 // Task is tied iff (Flags & 1) == 1.
1890 // Task is untied iff (Flags & 1) == 0.
1891 // Task is final iff (Flags & 2) == 2.
1892 // Task is not final iff (Flags & 2) == 0.
1893 // Task is mergeable iff (Flags & 4) == 4.
1894 // Task is not mergeable iff (Flags & 4) == 0.
1895 // TODO: Handle the other flags.
1896 Value *Flags = Builder.getInt32(Tied);
1897 if (Final) {
1898 Value *FinalFlag =
1900 Flags = Builder.CreateOr(FinalFlag, Flags);
1901 }
1902
1903 if (Mergeable)
1905
1906 // Argument - `sizeof_kmp_task_t` (TaskSize)
1907 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1908 // including private vars accessed in task.
1909 // TODO: add kmp_task_t_with_privates (privates)
1910 Value *TaskSize = Builder.getInt64(
1912
1913 // Argument - `sizeof_shareds` (SharedsSize)
1914 // SharedsSize refers to the shareds array size in the kmp_task_t data
1915 // structure.
1916 Value *SharedsSize = Builder.getInt64(0);
1917 if (HasShareds) {
1918 AllocaInst *ArgStructAlloca =
1919 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1920 assert(ArgStructAlloca &&
1921 "Unable to find the alloca instruction corresponding to arguments "
1922 "for extracted function");
1923 StructType *ArgStructType =
1924 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1925 assert(ArgStructType && "Unable to find struct type corresponding to "
1926 "arguments for extracted function");
1927 SharedsSize =
1929 }
1930 // Emit the @__kmpc_omp_task_alloc runtime call
1931 // The runtime call returns a pointer to an area where the task captured
1932 // variables must be copied before the task is run (TaskData)
1933 CallInst *TaskData = Builder.CreateCall(
1934 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1935 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1936 /*task_func=*/&OutlinedFn});
1937
1938 // Emit detach clause initialization.
1939 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1940 // task_descriptor);
1941 if (EventHandle) {
1943 OMPRTL___kmpc_task_allow_completion_event);
1944 llvm::Value *EventVal =
1945 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1946 llvm::Value *EventHandleAddr =
1948 Builder.getPtrTy(0));
1949 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1950 Builder.CreateStore(EventVal, EventHandleAddr);
1951 }
1952 // Copy the arguments for outlined function
1953 if (HasShareds) {
1954 Value *Shareds = StaleCI->getArgOperand(1);
1955 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1956 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1957 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1958 SharedsSize);
1959 }
1960
1961 Value *DepArray = nullptr;
1962 if (Dependencies.size()) {
1963 InsertPointTy OldIP = Builder.saveIP();
1965 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1966
1967 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1968 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1969
1970 unsigned P = 0;
1971 for (const DependData &Dep : Dependencies) {
1972 Value *Base =
1973 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1974 // Store the pointer to the variable
1976 DependInfo, Base,
1977 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1978 Value *DepValPtr =
1980 Builder.CreateStore(DepValPtr, Addr);
1981 // Store the size of the variable
1983 DependInfo, Base,
1984 static_cast<unsigned int>(RTLDependInfoFields::Len));
1986 Dep.DepValueType)),
1987 Size);
1988 // Store the dependency kind
1990 DependInfo, Base,
1991 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1993 ConstantInt::get(Builder.getInt8Ty(),
1994 static_cast<unsigned int>(Dep.DepKind)),
1995 Flags);
1996 ++P;
1997 }
1998
1999 Builder.restoreIP(OldIP);
2000 }
2001
2002 // In the presence of the `if` clause, the following IR is generated:
2003 // ...
2004 // %data = call @__kmpc_omp_task_alloc(...)
2005 // br i1 %if_condition, label %then, label %else
2006 // then:
2007 // call @__kmpc_omp_task(...)
2008 // br label %exit
2009 // else:
2010 // ;; Wait for resolution of dependencies, if any, before
2011 // ;; beginning the task
2012 // call @__kmpc_omp_wait_deps(...)
2013 // call @__kmpc_omp_task_begin_if0(...)
2014 // call @outlined_fn(...)
2015 // call @__kmpc_omp_task_complete_if0(...)
2016 // br label %exit
2017 // exit:
2018 // ...
2019 if (IfCondition) {
2020 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2021 // terminator.
2022 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2023 Instruction *IfTerminator =
2024 Builder.GetInsertPoint()->getParent()->getTerminator();
2025 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2026 Builder.SetInsertPoint(IfTerminator);
2027 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2028 &ElseTI);
2029 Builder.SetInsertPoint(ElseTI);
2030
2031 if (Dependencies.size()) {
2032 Function *TaskWaitFn =
2033 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2035 TaskWaitFn,
2036 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2037 ConstantInt::get(Builder.getInt32Ty(), 0),
2039 }
2040 Function *TaskBeginFn =
2041 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2042 Function *TaskCompleteFn =
2043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2044 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2045 CallInst *CI = nullptr;
2046 if (HasShareds)
2047 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2048 else
2049 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2050 CI->setDebugLoc(StaleCI->getDebugLoc());
2051 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2052 Builder.SetInsertPoint(ThenTI);
2053 }
2054
2055 if (Dependencies.size()) {
2056 Function *TaskFn =
2057 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2059 TaskFn,
2060 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2061 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2063
2064 } else {
2065 // Emit the @__kmpc_omp_task runtime call to spawn the task
2066 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2067 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2068 }
2069
2070 StaleCI->eraseFromParent();
2071
2072 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2073 if (HasShareds) {
2074 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2075 OutlinedFn.getArg(1)->replaceUsesWithIf(
2076 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2077 }
2078
2079 for (Instruction *I : llvm::reverse(ToBeDeleted))
2080 I->eraseFromParent();
2081 };
2082
2083 addOutlineInfo(std::move(OI));
2084 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2085
2086 return Builder.saveIP();
2087}
2088
2091 InsertPointTy AllocaIP,
2092 BodyGenCallbackTy BodyGenCB) {
2093 if (!updateToLocation(Loc))
2094 return InsertPointTy();
2095
2096 uint32_t SrcLocStrSize;
2097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2099 Value *ThreadID = getOrCreateThreadID(Ident);
2100
2101 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2102 Function *TaskgroupFn =
2103 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2104 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2105
2106 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2107 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2108 return Err;
2109
2110 Builder.SetInsertPoint(TaskgroupExitBB);
2111 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2112 Function *EndTaskgroupFn =
2113 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2114 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2115
2116 return Builder.saveIP();
2117}
2118
2120 const LocationDescription &Loc, InsertPointTy AllocaIP,
2122 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2123 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2124
2125 if (!updateToLocation(Loc))
2126 return Loc.IP;
2127
2128 auto FiniCBWrapper = [&](InsertPointTy IP) {
2129 if (IP.getBlock()->end() != IP.getPoint())
2130 return FiniCB(IP);
2131 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2132 // will fail because that function requires the Finalization Basic Block to
2133 // have a terminator, which is already removed by EmitOMPRegionBody.
2134 // IP is currently at cancelation block.
2135 // We need to backtrack to the condition block to fetch
2136 // the exit block and create a branch from cancelation
2137 // to exit block.
2139 Builder.restoreIP(IP);
2140 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2141 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2142 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2143 Instruction *I = Builder.CreateBr(ExitBB);
2144 IP = InsertPointTy(I->getParent(), I->getIterator());
2145 return FiniCB(IP);
2146 };
2147
2148 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2149
2150 // Each section is emitted as a switch case
2151 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2152 // -> OMP.createSection() which generates the IR for each section
2153 // Iterate through all sections and emit a switch construct:
2154 // switch (IV) {
2155 // case 0:
2156 // <SectionStmt[0]>;
2157 // break;
2158 // ...
2159 // case <NumSection> - 1:
2160 // <SectionStmt[<NumSection> - 1]>;
2161 // break;
2162 // }
2163 // ...
2164 // section_loop.after:
2165 // <FiniCB>;
2166 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2167 Builder.restoreIP(CodeGenIP);
2169 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2170 Function *CurFn = Continue->getParent();
2171 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2172
2173 unsigned CaseNumber = 0;
2174 for (auto SectionCB : SectionCBs) {
2176 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2177 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2178 Builder.SetInsertPoint(CaseBB);
2179 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2180 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2181 CaseEndBr->getIterator()}))
2182 return Err;
2183 CaseNumber++;
2184 }
2185 // remove the existing terminator from body BB since there can be no
2186 // terminators after switch/case
2187 return Error::success();
2188 };
2189 // Loop body ends here
2190 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2191 Type *I32Ty = Type::getInt32Ty(M.getContext());
2192 Value *LB = ConstantInt::get(I32Ty, 0);
2193 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2194 Value *ST = ConstantInt::get(I32Ty, 1);
2196 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2197 if (!LoopInfo)
2198 return LoopInfo.takeError();
2199
2200 InsertPointOrErrorTy WsloopIP =
2201 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2202 if (!WsloopIP)
2203 return WsloopIP.takeError();
2204 InsertPointTy AfterIP = *WsloopIP;
2205
2206 // Apply the finalization callback in LoopAfterBB
2207 auto FiniInfo = FinalizationStack.pop_back_val();
2208 assert(FiniInfo.DK == OMPD_sections &&
2209 "Unexpected finalization stack state!");
2210 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2211 Builder.restoreIP(AfterIP);
2212 BasicBlock *FiniBB =
2213 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2214 if (Error Err = CB(Builder.saveIP()))
2215 return Err;
2216 AfterIP = {FiniBB, FiniBB->begin()};
2217 }
2218
2219 return AfterIP;
2220}
2221
2224 BodyGenCallbackTy BodyGenCB,
2225 FinalizeCallbackTy FiniCB) {
2226 if (!updateToLocation(Loc))
2227 return Loc.IP;
2228
2229 auto FiniCBWrapper = [&](InsertPointTy IP) {
2230 if (IP.getBlock()->end() != IP.getPoint())
2231 return FiniCB(IP);
2232 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2233 // will fail because that function requires the Finalization Basic Block to
2234 // have a terminator, which is already removed by EmitOMPRegionBody.
2235 // IP is currently at cancelation block.
2236 // We need to backtrack to the condition block to fetch
2237 // the exit block and create a branch from cancelation
2238 // to exit block.
2240 Builder.restoreIP(IP);
2241 auto *CaseBB = Loc.IP.getBlock();
2242 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2243 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2244 Instruction *I = Builder.CreateBr(ExitBB);
2245 IP = InsertPointTy(I->getParent(), I->getIterator());
2246 return FiniCB(IP);
2247 };
2248
2249 Directive OMPD = Directive::OMPD_sections;
2250 // Since we are using Finalization Callback here, HasFinalize
2251 // and IsCancellable have to be true
2252 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2253 /*Conditional*/ false, /*hasFinalize*/ true,
2254 /*IsCancellable*/ true);
2255}
2256
2259 IT++;
2260 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2261}
2262
/// Emits a module-level global named \p Name that holds an array of pointers
/// built from the entries of \p List. The global is created with appending
/// linkage and placed in the "llvm.metadata" section.
///
/// \param Name Name given to the created global variable.
/// \param List Values to reference; each entry is cast to Constant.
///
/// Returns without emitting anything when \p List is empty.
2263void OpenMPIRBuilder::emitUsed(StringRef Name,
2264 std::vector<WeakTrackingVH> &List) {
2265 if (List.empty())
2266 return;
2267
2268 // Convert List to what ConstantArray needs.
// One array slot per list entry; every slot has the builder's default
// pointer type.
2270 UsedArray.resize(List.size());
2271 for (unsigned I = 0, E = List.size(); I != E; ++I)
2273 cast<Constant>(&*List[I]), Builder.getPtrTy());
2274
// NOTE(review): UsedArray was just resized to List.size(), which is non-zero
// past the early return above, so this guard appears unreachable - confirm.
2275 if (UsedArray.empty())
2276 return;
2277 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2278
// The global is non-constant (third argument) and uses appending linkage.
2279 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2280 ConstantArray::get(ATy, UsedArray), Name);
2281
2282 GV->setSection("llvm.metadata");
2283}
2284
/// Emits, at the builder's current insert point, a call with no arguments to
/// the runtime entry identified by
/// OMPRTL___kmpc_get_hardware_thread_id_in_block.
///
/// \returns the value produced by the emitted call.
2285Value *OpenMPIRBuilder::getGPUThreadID() {
2286 return Builder.CreateCall(
2288 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2289 {});
2290}
2291
/// Emits, at the builder's current insert point, a call with no arguments to
/// the runtime function OMPRTL___kmpc_get_warp_size, which is looked up (or
/// declared) in the module through getOrCreateRuntimeFunction.
///
/// \returns the value produced by the emitted call.
2292Value *OpenMPIRBuilder::getGPUWarpSize() {
2293 return Builder.CreateCall(
2294 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2295}
2296
/// Emits IR that computes the warp index of the calling thread: the hardware
/// thread id shifted right (arithmetic shift) by log2 of the configured warp
/// size.
///
/// \returns the shift instruction's value, named "nvptx_warp_id".
2297Value *OpenMPIRBuilder::getNVPTXWarpID() {
// Number of low thread-id bits that select the lane inside a warp.
2298 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2299 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2300}
2301
/// Emits IR that computes the lane index of the calling thread inside its
/// warp: the hardware thread id masked down to its low log2(warp size) bits.
///
/// \returns the `and` instruction's value, named "nvptx_lane_id".
2302Value *OpenMPIRBuilder::getNVPTXLaneID() {
2303 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2304 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
// Mask with the low LaneIDBits bits set. (1u << N) - 1u is well defined for
// every N in [0, 31], unlike ~0u >> (32u - N), which shifts by the full
// width of unsigned (undefined behaviour) when N == 0.
2305 unsigned LaneIDMask = (1u << LaneIDBits) - 1u;
2306 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2307 "nvptx_lane_id");
2308}
2309
/// Converts \p From to \p ToType, choosing the cheapest conversion that
/// applies:
///   - identical types: \p From is returned unchanged;
///   - equal store sizes: a bitcast;
///   - both integer types: a signed integer cast;
///   - otherwise: the value is written to a stack slot of type \p ToType and
///     read back as \p ToType.
///
/// \param AllocaIP Insert point used for the temporary stack slot.
/// \param From     Value to convert.
/// \param ToType   Requested result type.
/// \returns a value of type \p ToType (or \p From itself when no conversion
///          is needed).
2310Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2311 Type *ToType) {
2312 Type *FromType = From->getType();
2313 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2314 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2315 assert(FromSize > 0 && "From size must be greater than zero");
2316 assert(ToSize > 0 && "To size must be greater than zero");
2317 if (FromType == ToType)
2318 return From;
2319 if (FromSize == ToSize)
2320 return Builder.CreateBitCast(From, ToType);
2321 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2322 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
// Fallback: go through memory. The slot is created at AllocaIP, then the
// builder is put back where it was so the store/load land in the caller's
// current position.
2323 InsertPointTy SaveIP = Builder.saveIP();
2324 Builder.restoreIP(AllocaIP);
2325 Value *CastItem = Builder.CreateAlloca(ToType);
2326 Builder.restoreIP(SaveIP);
2327
2329 CastItem, Builder.getPtrTy(0));
2330 Builder.CreateStore(From, ValCastItem);
2331 return Builder.CreateLoad(ToType, CastItem);
2332}
2333
/// Emits a call to one of the runtime shuffle routines
/// (OMPRTL___kmpc_shuffle_int32 for elements of at most 4 bytes,
/// OMPRTL___kmpc_shuffle_int64 otherwise) for \p Element.
///
/// \param AllocaIP    Insert point forwarded to castValueToType for any
///                    temporary it needs.
/// \param Element     Value to shuffle.
/// \param ElementType Type whose store size selects the 32- or 64-bit routine;
///                    must be at most 8 bytes (asserted).
/// \param Offset      Passed through as the second argument of the runtime
///                    call.
/// \returns the result of the runtime call after castValueToType to the 32- or
///          64-bit integer type used for the call.
2334Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2335 Value *Element,
2336 Type *ElementType,
2337 Value *Offset) {
2338 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2339 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2340
2341 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2342 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2343 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
// The warp size is passed to the runtime as a signed 16-bit integer.
2344 Value *WarpSize =
2345 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2347 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2348 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
// NOTE(review): WarpSize is already an i16 at this point, so this second
// cast to i16 looks redundant - confirm before removing.
2349 Value *WarpSizeCast =
2350 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2351 Value *ShuffleCall =
2352 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
// NOTE(review): the result is cast to CastTy, not back to ElementType;
// callers in this file pass an integer ElementType - confirm this is intended.
2353 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2354}
2355
/// Emits IR that shuffles one element of type \p ElemType from \p SrcAddr
/// into \p DstAddr by breaking it into integer pieces of 8, 4, 2 and 1 bytes
/// and passing each piece through createRuntimeShuffleFunction.
///
/// For each piece width, if more than one piece of that width fits in the
/// remaining size a loop (blocks ".shuffle.pre_cond", ".shuffle.then",
/// ".shuffle.exit") is emitted; otherwise a single straight-line
/// load/shuffle/store is emitted. The remaining size is then reduced modulo
/// the piece width and the next smaller width is tried.
///
/// \param AllocaIP         Insert point forwarded to the shuffle helper.
/// \param SrcAddr          Address the element is read from.
/// \param DstAddr          Address the shuffled element is written to.
/// \param ElemType         Type of the element being transferred.
/// \param Offset           Forwarded unchanged to the shuffle helper.
/// \param ReductionArrayTy Not referenced by any line visible in this listing.
2356void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2357 Value *DstAddr, Type *ElemType,
2358 Value *Offset, Type *ReductionArrayTy) {
2360 // Create the loop over the big sized data.
2361 // ptr = (void*)Elem;
2362 // ptrEnd = (void*) Elem + 1;
2363 // Step = 8;
2364 // while (ptr + Step < ptrEnd)
2365 // shuffle((int64_t)*ptr);
2366 // Step = 4;
2367 // while (ptr + Step < ptrEnd)
2368 // shuffle((int32_t)*ptr);
2369 // ...
2370 Type *IndexTy = Builder.getIndexTy(
// ElemPtr / Ptr are the running destination / source cursors; they advance
// by one piece after every shuffle.
2372 Value *ElemPtr = DstAddr;
2373 Value *Ptr = SrcAddr;
// Piece widths in bytes: 8, 4, 2, 1.
2374 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2375 if (Size < IntSize)
2376 continue;
2377 Type *IntType = Builder.getIntNTy(IntSize * 8);
2379 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
// Address one whole element past SrcAddr, i.e. the end of the source data.
2380 Value *SrcAddrGEP =
2381 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2383 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2384
2385 Function *CurFunc = Builder.GetInsertBlock()->getParent();
// More than one piece of this width remains: emit a loop.
2386 if ((Size / IntSize) > 1) {
2388 SrcAddrGEP, Builder.getPtrTy());
2389 BasicBlock *PreCondBB =
2390 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2391 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2392 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2393 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2394 emitBlock(PreCondBB, CurFunc);
// Loop-carried cursors: the first incoming value comes from the block we
// entered from, the second (added below) from the loop body.
2395 PHINode *PhiSrc =
2396 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2397 PhiSrc->addIncoming(Ptr, CurrentBB);
2398 PHINode *PhiDest =
2399 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2400 PhiDest->addIncoming(ElemPtr, CurrentBB);
2401 Ptr = PhiSrc;
2402 ElemPtr = PhiDest;
// Enter the body only while the byte distance computed below is greater
// than IntSize - 1 (signed compare), i.e. at least one full piece remains.
2403 Value *PtrDiff = Builder.CreatePtrDiff(
2404 Builder.getInt8Ty(), PtrEnd,
2407 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2408 ExitBB);
2409 emitBlock(ThenBB, CurFunc);
2410 Value *Res = createRuntimeShuffleFunction(
2411 AllocaIP,
2413 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2414 IntType, Offset);
2415 Builder.CreateAlignedStore(Res, ElemPtr,
2416 M.getDataLayout().getPrefTypeAlign(ElemType));
// Advance both cursors by one piece and feed them back into the PHIs.
2417 Value *LocalPtr =
2418 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2419 Value *LocalElemPtr =
2420 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2421 PhiSrc->addIncoming(LocalPtr, ThenBB);
2422 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2423 emitBranch(PreCondBB);
2424 emitBlock(ExitBB, CurFunc);
2425 } else {
// Exactly one piece of this width: straight-line load, shuffle, store.
2426 Value *Res = createRuntimeShuffleFunction(
2427 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
// The shuffle helper works on 32- or 64-bit integers; narrow the result
// when the element itself is a smaller integer.
2428 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2429 Res->getType()->getScalarSizeInBits())
2430 Res = Builder.CreateTrunc(Res, ElemType);
2431 Builder.CreateStore(Res, ElemPtr);
2432 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2433 ElemPtr =
2434 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2435 }
// Bytes still to be transferred with smaller piece widths.
2436 Size = Size % IntSize;
2437 }
2438}
2439
/// Emits IR that copies every element of a source Reduce list to a
/// destination Reduce list. Both lists are arrays of pointers of type
/// \p ReductionArrayTy, with one slot per entry of \p ReductionInfos.
///
/// For each entry the source element pointer is loaded from the source list
/// and the destination element address is chosen according to \p Action.
/// One action allocates a fresh element at \p AllocaIP, requests a shuffle
/// from a remote lane and records the new element in the destination list.
/// The other loads the existing destination pointer. The element is then
/// either shuffled in via shuffleAndStore or copied according to the entry's
/// evaluation kind (scalar, complex, aggregate).
///
/// \param AllocaIP         Insert point for any temporary element that is
///                         created.
/// \param Action           Selects how the destination element address is
///                         obtained.
/// \param ReductionArrayTy Array type used to index both lists.
/// \param ReductionInfos   Per-element description (element type, evaluation
///                         kind).
/// \param SrcBase          Base address of the source Reduce list.
/// \param DestBase         Base address of the destination Reduce list.
/// \param CopyOptions      Supplies RemoteLaneOffset, forwarded to
///                         shuffleAndStore.
2440void OpenMPIRBuilder::emitReductionListCopy(
2441 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2442 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2443 CopyOptionsTy CopyOptions) {
2444 Type *IndexTy = Builder.getIndexTy(
2446 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2447
2448 // Iterates, element-by-element, through the source Reduce list and
2449 // make a copy.
2450 for (auto En : enumerate(ReductionInfos)) {
2451 const ReductionInfo &RI = En.value();
2452 Value *SrcElementAddr = nullptr;
2453 Value *DestElementAddr = nullptr;
2454 Value *DestElementPtrAddr = nullptr;
2455 // Should we shuffle in an element from a remote lane?
2456 bool ShuffleInElement = false;
2457 // Set to true to update the pointer in the dest Reduce list to a
2458 // newly created element.
2459 bool UpdateDestListPtr = false;
2460
2461 // Step 1.1: Get the address for the src element in the Reduce list.
2462 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2463 ReductionArrayTy, SrcBase,
2464 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2465 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2466
2467 // Step 1.2: Create a temporary to store the element in the destination
2468 // Reduce list.
2469 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2470 ReductionArrayTy, DestBase,
2471 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2472 switch (Action) {
// First action: allocate a new destination element at AllocaIP (preferred
// alignment of the element type), then return to the current position.
2474 InsertPointTy CurIP = Builder.saveIP();
2475 Builder.restoreIP(AllocaIP);
2476 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2477 ".omp.reduction.element");
2478 DestAlloca->setAlignment(
2479 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2480 DestElementAddr = DestAlloca;
2481 DestElementAddr =
2482 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.restoreIP(CurIP);
2485 ShuffleInElement = true;
2486 UpdateDestListPtr = true;
2487 break;
2488 }
// Second action: reuse the element the destination list already points to.
2490 DestElementAddr =
2491 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2492 break;
2493 }
2494 }
2495
2496 // Now that all active lanes have read the element in the
2497 // Reduce list, shuffle over the value from the remote lane.
2498 if (ShuffleInElement) {
2499 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2500 RemoteLaneOffset, ReductionArrayTy);
2501 } else {
// Plain copy; the strategy depends on how the element is evaluated.
2502 switch (RI.EvaluationKind) {
2503 case EvalKind::Scalar: {
2504 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2505 // Store the source element value to the dest element address.
2506 Builder.CreateStore(Elem, DestElementAddr);
2507 break;
2508 }
2509 case EvalKind::Complex: {
// Copy the two struct members (index 0 = real, index 1 = imaginary)
// one at a time.
2511 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2512 Value *SrcReal = Builder.CreateLoad(
2513 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2515 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2516 Value *SrcImg = Builder.CreateLoad(
2517 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2518
2520 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2522 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2523 Builder.CreateStore(SrcReal, DestRealPtr);
2524 Builder.CreateStore(SrcImg, DestImgPtr);
2525 break;
2526 }
2527 case EvalKind::Aggregate: {
// Byte-wise copy of the whole element (store size of the element type),
// using the preferred alignment on both sides.
2528 Value *SizeVal = Builder.getInt64(
2529 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2531 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2532 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2533 SizeVal, false);
2534 break;
2535 }
2536 };
2537 }
2538
2539 // Step 3.1: Modify reference in dest Reduce list as needed.
2540 // Modifying the reference in Reduce list to point to the newly
2541 // created element. The element is live in the current function
2542 // scope and that of functions it invokes (i.e., reduce_function).
2543 // RemoteReduceData[i] = (void*)&RemoteElem
2544 if (UpdateDestListPtr) {
2546 DestElementAddr, Builder.getPtrTy(),
2547 DestElementAddr->getName() + ".ascast");
2548 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2549 }
2550 }
2551}
2552
2553Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2554 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2555 AttributeList FuncAttrs) {
2556 InsertPointTy SavedIP = Builder.saveIP();
2557 LLVMContext &Ctx = M.getContext();
2559 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2560 /* IsVarArg */ false);
2561 Function *WcFunc =
2563 "_omp_reduction_inter_warp_copy_func", &M);
2564 WcFunc->setAttributes(FuncAttrs);
2565 WcFunc->addParamAttr(0, Attribute::NoUndef);
2566 WcFunc->addParamAttr(1, Attribute::NoUndef);
2567 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2568 Builder.SetInsertPoint(EntryBB);
2569
2570 // ReduceList: thread local Reduce list.
2571 // At the stage of the computation when this function is called, partially
2572 // aggregated values reside in the first lane of every active warp.
2573 Argument *ReduceListArg = WcFunc->getArg(0);
2574 // NumWarps: number of warps active in the parallel region. This could
2575 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2576 Argument *NumWarpsArg = WcFunc->getArg(1);
2577
2578 // This array is used as a medium to transfer, one reduce element at a time,
2579 // the data from the first lane of every warp to lanes in the first warp
2580 // in order to perform the final step of a reduction in a parallel region
2581 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2582 // for reduced latency, as well as to have a distinct copy for concurrently
2583 // executing target regions. The array is declared with common linkage so
2584 // as to be shared across compilation units.
2585 StringRef TransferMediumName =
2586 "__openmp_nvptx_data_transfer_temporary_storage";
2587 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2588 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2589 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2590 if (!TransferMedium) {
2591 TransferMedium = new GlobalVariable(
2592 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2593 UndefValue::get(ArrayTy), TransferMediumName,
2594 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2595 /*AddressSpace=*/3);
2596 }
2597
2598 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2599 Value *GPUThreadID = getGPUThreadID();
2600 // nvptx_lane_id = nvptx_id % warpsize
2601 Value *LaneID = getNVPTXLaneID();
2602 // nvptx_warp_id = nvptx_id / warpsize
2603 Value *WarpID = getNVPTXWarpID();
2604
2605 InsertPointTy AllocaIP =
2608 Type *Arg0Type = ReduceListArg->getType();
2609 Type *Arg1Type = NumWarpsArg->getType();
2610 Builder.restoreIP(AllocaIP);
2611 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2612 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2613 AllocaInst *NumWarpsAlloca =
2614 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2616 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2618 NumWarpsAlloca, Builder.getPtrTy(0),
2619 NumWarpsAlloca->getName() + ".ascast");
2620 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2621 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2622 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2623 InsertPointTy CodeGenIP =
2625 Builder.restoreIP(CodeGenIP);
2626
2627 Value *ReduceList =
2628 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2629
2630 for (auto En : enumerate(ReductionInfos)) {
2631 //
2632 // Warp master copies reduce element to transfer medium in __shared__
2633 // memory.
2634 //
2635 const ReductionInfo &RI = En.value();
2636 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2637 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2638 Type *CType = Builder.getIntNTy(TySize * 8);
2639
2640 unsigned NumIters = RealTySize / TySize;
2641 if (NumIters == 0)
2642 continue;
2643 Value *Cnt = nullptr;
2644 Value *CntAddr = nullptr;
2645 BasicBlock *PrecondBB = nullptr;
2646 BasicBlock *ExitBB = nullptr;
2647 if (NumIters > 1) {
2648 CodeGenIP = Builder.saveIP();
2649 Builder.restoreIP(AllocaIP);
2650 CntAddr =
2651 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2652
2653 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2654 CntAddr->getName() + ".ascast");
2655 Builder.restoreIP(CodeGenIP);
2657 CntAddr,
2658 /*Volatile=*/false);
2659 PrecondBB = BasicBlock::Create(Ctx, "precond");
2660 ExitBB = BasicBlock::Create(Ctx, "exit");
2661 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2662 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2663 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2664 /*Volatile=*/false);
2666 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2667 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2669 }
2670
2671 // kmpc_barrier.
2672 InsertPointOrErrorTy BarrierIP1 =
2673 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2674 omp::Directive::OMPD_unknown,
2675 /* ForceSimpleCall */ false,
2676 /* CheckCancelFlag */ true);
2677 if (!BarrierIP1)
2678 return BarrierIP1.takeError();
2679 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2680 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2681 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2682
2683 // if (lane_id == 0)
2684 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2685 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2687
2688 // Reduce element = LocalReduceList[i]
2689 auto *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691 Type *IndexTy = Builder.getIndexTy(
2693 Value *ElemPtrPtr =
2694 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2695 {ConstantInt::get(IndexTy, 0),
2696 ConstantInt::get(IndexTy, En.index())});
2697 // elemptr = ((CopyType*)(elemptrptr)) + I
2698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2699 if (NumIters > 1)
2700 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2701
2702 // Get pointer to location in transfer medium.
2703 // MediumPtr = &medium[warp_id]
2704 Value *MediumPtr = Builder.CreateInBoundsGEP(
2705 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2706 // elem = *elemptr
2707 //*MediumPtr = elem
2708 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2709 // Store the source element value to the dest element address.
2710 Builder.CreateStore(Elem, MediumPtr,
2711 /*IsVolatile*/ true);
2712 Builder.CreateBr(MergeBB);
2713
2714 // else
2716 Builder.CreateBr(MergeBB);
2717
2718 // endif
2720 InsertPointOrErrorTy BarrierIP2 =
2721 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2722 omp::Directive::OMPD_unknown,
2723 /* ForceSimpleCall */ false,
2724 /* CheckCancelFlag */ true);
2725 if (!BarrierIP2)
2726 return BarrierIP2.takeError();
2727
2728 // Warp 0 copies reduce element from transfer medium
2729 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2730 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2731 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2732
2733 Value *NumWarpsVal =
2734 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2735 // Up to 32 threads in warp 0 are active.
2736 Value *IsActiveThread =
2737 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2738 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2739
2740 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2741
2742 // SrcMediumPtr = &medium[tid]
2743 // SrcMediumVal = *SrcMediumPtr
2744 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2745 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2746 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2747 Value *TargetElemPtrPtr =
2748 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2749 {ConstantInt::get(IndexTy, 0),
2750 ConstantInt::get(IndexTy, En.index())});
2751 Value *TargetElemPtrVal =
2752 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2753 Value *TargetElemPtr = TargetElemPtrVal;
2754 if (NumIters > 1)
2755 TargetElemPtr =
2756 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2757
2758 // *TargetElemPtr = SrcMediumVal;
2759 Value *SrcMediumValue =
2760 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2761 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2762 Builder.CreateBr(W0MergeBB);
2763
2764 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2765 Builder.CreateBr(W0MergeBB);
2766
2767 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2768
2769 if (NumIters > 1) {
2770 Cnt = Builder.CreateNSWAdd(
2771 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2772 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2773
2774 auto *CurFn = Builder.GetInsertBlock()->getParent();
2775 emitBranch(PrecondBB);
2776 emitBlock(ExitBB, CurFn);
2777 }
2778 RealTySize %= TySize;
2779 }
2780 }
2781
2783 Builder.restoreIP(SavedIP);
2784
2785 return WcFunc;
2786}
2787
/// Emit the helper function "_omp_reduction_shuffle_and_reduce_func" into the
/// module and return it.
///
/// The generated function takes (ptr ReduceList, i16 LaneID,
/// i16 RemoteLaneOffset, i16 AlgoVer). Its body:
///   1. spills the arguments to allocas and reloads them,
///   2. copies the reduce list of a remote lane into a local
///      ".omp.reduction.remote_reduce_list" array via emitReductionListCopy
///      (CopyAction::RemoteLaneToThread),
///   3. calls \p ReduceFn(local list, remote list) when the AlgoVer/LaneId/
///      Offset predicate described below holds,
///   4. copies the remote list over the local list (CopyAction::ThreadCopy)
///      when AlgoVer == 1 and LaneId >= RemoteLaneOffset.
///
/// \param ReductionInfos Reductions to handle; its size determines the length
///                       of the pointer array type used for the reduce lists.
/// \param ReduceFn       Function called with the local and remote list.
/// \param FuncAttrs      Attribute list set on the generated function.
/// \returns The generated function (SarFunc).
///
/// NOTE(review): this listing omits several source lines (gaps in the embedded
/// line numbers, e.g. 2793, 2798, 2839, 2913-2914); the statement heads on
/// those lines are not visible here.
2788Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2789 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2790 AttributeList FuncAttrs) {
2791 LLVMContext &Ctx = M.getContext();
2792 FunctionType *FuncTy =
 // Parameter types: (ptr, i16, i16, i16). The return type is given on a line
 // missing from this listing.
2794 {Builder.getPtrTy(), Builder.getInt16Ty(),
2795 Builder.getInt16Ty(), Builder.getInt16Ty()},
2796 /* IsVarArg */ false);
2797 Function *SarFunc =
2799 "_omp_reduction_shuffle_and_reduce_func", &M);
 // Every parameter is marked noundef; the three i16 parameters are also
 // marked signext.
2800 SarFunc->setAttributes(FuncAttrs);
2801 SarFunc->addParamAttr(0, Attribute::NoUndef);
2802 SarFunc->addParamAttr(1, Attribute::NoUndef);
2803 SarFunc->addParamAttr(2, Attribute::NoUndef);
2804 SarFunc->addParamAttr(3, Attribute::NoUndef);
2805 SarFunc->addParamAttr(1, Attribute::SExt);
2806 SarFunc->addParamAttr(2, Attribute::SExt);
2807 SarFunc->addParamAttr(3, Attribute::SExt);
2808 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2809 Builder.SetInsertPoint(EntryBB);
2810
2811 // Thread local Reduce list used to host the values of data to be reduced.
2812 Argument *ReduceListArg = SarFunc->getArg(0);
2813 // Current lane id; could be logical.
2814 Argument *LaneIDArg = SarFunc->getArg(1);
2815 // Offset of the remote source lane relative to the current lane.
2816 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2817 // Algorithm version. This is expected to be known at compile time.
2818 Argument *AlgoVerArg = SarFunc->getArg(3);
2819
 // One "<arg>.addr" stack slot per argument. The three i16 arguments all use
 // LaneIDArgType as their slot type.
2820 Type *ReduceListArgType = ReduceListArg->getType();
2821 Type *LaneIDArgType = LaneIDArg->getType();
2822 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2823 Value *ReduceListAlloca = Builder.CreateAlloca(
2824 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2825 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2826 LaneIDArg->getName() + ".addr");
2827 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2828 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2829 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2830 AlgoVerArg->getName() + ".addr");
2831 ArrayType *RedListArrayTy =
2832 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2833
2834 // Create a local thread-private variable to host the Reduce list
2835 // from a remote lane.
2836 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2837 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2838
 // Casts of the allocas, each named "<alloca>.ascast".
 // NOTE(review): the heads of several of these statements (which define
 // ReduceListAddrCast, LaneIdAddrCast, AlgoVerAddrCast and RemoteListAddrCast,
 // used below) are on lines missing from this listing; presumably they mirror
 // the visible RemoteLaneOffsetAddrCast statement -- confirm against the
 // full source.
2840 ReduceListAlloca, ReduceListArgType,
2841 ReduceListAlloca->getName() + ".ascast");
2843 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2844 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2846 RemoteLaneOffsetAlloca->getName() + ".ascast");
2848 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2850 RemoteReductionListAlloca, Builder.getPtrTy(),
2851 RemoteReductionListAlloca->getName() + ".ascast");
2852
 // Spill the incoming arguments to their slots, then reload them.
2853 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2854 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2855 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2856 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2857
2858 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2859 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2860 Value *RemoteLaneOffset =
2861 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2862 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2863
 // Insert point handed to emitReductionListCopy as its AllocaIP: right after
 // the remote reduce list alloca.
2864 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2865
2866 // This loop iterates through the list of reduce elements and copies,
2867 // element by element, from a remote lane in the warp to RemoteReduceList,
2868 // hosted on the thread's stack.
2869 emitReductionListCopy(
2870 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2871 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2872
2873 // The actions to be performed on the Remote Reduce list is dependent
2874 // on the algorithm version.
2875 //
2876 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2877 // LaneId % 2 == 0 && Offset > 0):
2878 // do the reduction value aggregation
2879 //
2880 // The thread local variable Reduce list is mutated in place to host the
2881 // reduced data, which is the aggregated value produced from local and
2882 // remote lanes.
2883 //
2884 // Note that AlgoVer is expected to be a constant integer known at compile
2885 // time.
2886 // When AlgoVer==0, the first conjunction evaluates to true, making
2887 // the entire predicate true during compile time.
2888 // When AlgoVer==1, the second conjunction has only the second part to be
2889 // evaluated during runtime. Other conjunctions evaluates to false
2890 // during compile time.
2891 // When AlgoVer==2, the third conjunction has only the second part to be
2892 // evaluated during runtime. Other conjunctions evaluates to false
2893 // during compile time.
 // CondAlgo0: AlgoVer == 0.
2894 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
 // CondAlgo1: AlgoVer == 1 && LaneId < RemoteLaneOffset (unsigned compare).
2895 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2896 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2897 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
 // CondAlgo2: AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0
 // (the offset test is a signed compare).
2898 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2899 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2900 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2901 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2902 Value *RemoteOffsetComp =
2903 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2904 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2905 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2906 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2907
2908 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2909 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2910 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2911
2912 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
 // NOTE(review): lines 2913-2914 are missing from this listing; the definition
 // of LocalReduceListPtr (used in the call below) starts there.
2915 ReduceList, Builder.getPtrTy());
2916 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 RemoteListAddrCast, Builder.getPtrTy());
 // Call ReduceFn(local list, remote list); the call is marked nounwind.
2918 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2919 ->addFnAttr(Attribute::NoUnwind);
2920 Builder.CreateBr(MergeBB);
2921
2923 Builder.CreateBr(MergeBB);
2924
2926
2927 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2928 // Reduce list.
2929 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
 // Despite the variable's name, this is an unsigned >= comparison.
2930 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2931 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2932
2933 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2934 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2935 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2936 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2937
2938 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2939 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2940 ReductionInfos, RemoteListAddrCast, ReduceList);
2941 Builder.CreateBr(CpyMergeBB);
2942
 // The else block does nothing but branch to the merge block.
2943 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2944 Builder.CreateBr(CpyMergeBB);
2945
2946 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2947
2949
2950 return SarFunc;
2951}
2952
/// Emit the helper function "_omp_reduction_list_to_global_copy_func" into the
/// module and return it.
///
/// The generated function takes (ptr Buffer, i32 Idx, ptr ReduceList). For
/// each entry of \p ReductionInfos it loads the element pointer stored in
/// ReduceList[i], addresses element Idx of the buffer (typed as
/// \p ReductionsBufferTy), and copies the element into the buffer according
/// to the entry's EvaluationKind (Scalar, Complex or Aggregate).
///
/// \param ReductionInfos     Reductions whose elements are copied.
/// \param ReductionsBufferTy Type used to index the global buffer.
/// \param FuncAttrs          Attribute list set on the generated function.
/// \returns The generated function (LtGCFunc).
///
/// NOTE(review): this listing omits several source lines (gaps in the embedded
/// line numbers); in particular the definitions of OldIP and GlobVal and the
/// call that performs the aggregate copy are not visible here.
2953Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2954 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2955 AttributeList FuncAttrs) {
2957 LLVMContext &Ctx = M.getContext();
 // Parameter types: (ptr, i32, ptr).
2960 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2961 /* IsVarArg */ false);
2962 Function *LtGCFunc =
2964 "_omp_reduction_list_to_global_copy_func", &M);
2965 LtGCFunc->setAttributes(FuncAttrs);
2966 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2967 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2968 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2969
2970 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2971 Builder.SetInsertPoint(EntryBlock);
2972
2973 // Buffer: global reduction buffer.
2974 Argument *BufferArg = LtGCFunc->getArg(0);
2975 // Idx: index of the buffer.
2976 Argument *IdxArg = LtGCFunc->getArg(1);
2977 // ReduceList: thread local Reduce list.
2978 Argument *ReduceListArg = LtGCFunc->getArg(2);
2979
 // One "<arg>.addr" stack slot per argument, each followed by an
 // "<alloca>.ascast" cast.
2980 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2981 BufferArg->getName() + ".addr");
2982 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2983 IdxArg->getName() + ".addr");
2984 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2985 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2987 BufferArgAlloca, Builder.getPtrTy(),
2988 BufferArgAlloca->getName() + ".ascast");
2990 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2991 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2992 ReduceListArgAlloca, Builder.getPtrTy(),
2993 ReduceListArgAlloca->getName() + ".ascast");
2994
 // Spill the arguments, then reload them for use in the copy loop.
2995 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2996 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2997 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2998
2999 Value *LocalReduceList =
3000 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3001 Value *BufferArgVal =
3002 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3003 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3004 Type *IndexTy = Builder.getIndexTy(
3006 for (auto En : enumerate(ReductionInfos)) {
3007 const ReductionInfo &RI = En.value();
3008 auto *RedListArrayTy =
3009 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3010 // Reduce element = LocalReduceList[i]
3011 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3012 RedListArrayTy, LocalReduceList,
3013 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3014 // elemptr = ((CopyType*)(elemptrptr)) + I
3015 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3016
3017 // Global = Buffer.VD[Idx];
3018 Value *BufferVD =
3019 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
 // NOTE(review): the head of the statement defining GlobVal is missing from
 // this listing; its visible arguments index field En.index() of BufferVD.
3021 ReductionsBufferTy, BufferVD, 0, En.index());
3022
 // Copy direction for every kind: *ElemPtr (thread-local) -> GlobVal.
3023 switch (RI.EvaluationKind) {
3024 case EvalKind::Scalar: {
 // Plain load from the local element and store to the global slot.
3025 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3026 Builder.CreateStore(TargetElement, GlobVal);
3027 break;
3028 }
3029 case EvalKind::Complex: {
 // Copy struct fields 0 (".real") and 1 (".imag") separately.
3031 RI.ElementType, ElemPtr, 0, 0, ".realp");
3032 Value *SrcReal = Builder.CreateLoad(
3033 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3035 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3036 Value *SrcImg = Builder.CreateLoad(
3037 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3038
3040 RI.ElementType, GlobVal, 0, 0, ".realp");
3042 RI.ElementType, GlobVal, 0, 1, ".imagp");
3043 Builder.CreateStore(SrcReal, DestRealPtr);
3044 Builder.CreateStore(SrcImg, DestImgPtr);
3045 break;
3046 }
3047 case EvalKind::Aggregate: {
 // SizeVal is the store size of the element type as an i64 constant; both
 // operands use the preferred alignment of the element type.
 // NOTE(review): the call these arguments belong to is on a missing line;
 // presumably a memcpy from ElemPtr to GlobVal -- confirm.
3048 Value *SizeVal =
3049 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3051 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3052 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3053 break;
3054 }
3055 }
3056 }
3057
 // NOTE(review): OldIP is defined on a line missing from this listing;
 // presumably the builder insert point saved at function entry.
3059 Builder.restoreIP(OldIP);
3060 return LtGCFunc;
3061}
3062
/// Emit the helper function "_omp_reduction_list_to_global_reduce_func" into
/// the module and return it.
///
/// The generated function takes (ptr Buffer, i32 Idx, ptr ReduceList). It
/// fills a local ".omp.reduction.red_list" pointer array with one pointer per
/// reduction, each pointing into element Idx of the buffer (typed as
/// \p ReductionsBufferTy), and then calls \p ReduceFn with that array as the
/// first argument and the incoming ReduceList as the second.
///
/// \param ReductionInfos     Reductions; determines the pointer array length.
/// \param ReduceFn           Function called as ReduceFn(global list, list).
/// \param ReductionsBufferTy Type used to index the global buffer.
/// \param FuncAttrs          Attribute list set on the generated function.
/// \returns The generated function (LtGRFunc).
///
/// NOTE(review): this listing omits several source lines (gaps in the embedded
/// line numbers); the definitions of OldIP and GlobValPtr are not visible.
3063Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3064 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3065 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3067 LLVMContext &Ctx = M.getContext();
 // Parameter types: (ptr, i32, ptr).
3070 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3071 /* IsVarArg */ false);
3072 Function *LtGRFunc =
3074 "_omp_reduction_list_to_global_reduce_func", &M);
3075 LtGRFunc->setAttributes(FuncAttrs);
3076 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3077 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3078 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3079
3080 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3081 Builder.SetInsertPoint(EntryBlock);
3082
3083 // Buffer: global reduction buffer.
3084 Argument *BufferArg = LtGRFunc->getArg(0);
3085 // Idx: index of the buffer.
3086 Argument *IdxArg = LtGRFunc->getArg(1);
3087 // ReduceList: thread local Reduce list.
3088 Argument *ReduceListArg = LtGRFunc->getArg(2);
3089
 // One "<arg>.addr" stack slot per argument.
3090 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3091 BufferArg->getName() + ".addr");
3092 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3093 IdxArg->getName() + ".addr");
3094 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3095 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3096 auto *RedListArrayTy =
3097 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3098
3099 // 1. Build a list of reduction variables.
3100 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3101 Value *LocalReduceList =
3102 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3103
 // "<alloca>.ascast" casts of the slots above.
3105 BufferArgAlloca, Builder.getPtrTy(),
3106 BufferArgAlloca->getName() + ".ascast");
3108 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3109 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 ReduceListArgAlloca, Builder.getPtrTy(),
3111 ReduceListArgAlloca->getName() + ".ascast");
3112 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 LocalReduceList, Builder.getPtrTy(),
3114 LocalReduceList->getName() + ".ascast");
3115
 // Spill the arguments, then reload the buffer pointer and the index.
3116 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3117 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3118 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3119
3120 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3121 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3122 Type *IndexTy = Builder.getIndexTy(
3124 for (auto En : enumerate(ReductionInfos)) {
 // Slot i of the local pointer array.
3125 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3126 RedListArrayTy, LocalReduceListAddrCast,
3127 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3128 Value *BufferVD =
3129 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3130 // Global = Buffer.VD[Idx];
 // NOTE(review): the head of the statement defining GlobValPtr is missing
 // from this listing; its visible arguments index field En.index() of
 // BufferVD.
3132 ReductionsBufferTy, BufferVD, 0, En.index());
3133 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3134 }
3135
3136 // Call reduce_function(GlobalReduceList, ReduceList)
3137 Value *ReduceList =
3138 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3139 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3140 ->addFnAttr(Attribute::NoUnwind);
 // NOTE(review): OldIP is defined on a line missing from this listing;
 // presumably the builder insert point saved at function entry.
3142 Builder.restoreIP(OldIP);
3143 return LtGRFunc;
3144}
3145
/// Emit the helper function "_omp_reduction_global_to_list_copy_func" into the
/// module and return it.
///
/// The generated function takes (ptr Buffer, i32 Idx, ptr ReduceList). For
/// each entry of \p ReductionInfos it loads the element pointer stored in
/// ReduceList[i], addresses element Idx of the buffer (typed as
/// \p ReductionsBufferTy), and copies the buffer's value into the local
/// element according to the entry's EvaluationKind.
///
/// \param ReductionInfos     Reductions whose elements are copied.
/// \param ReductionsBufferTy Type used to index the global buffer.
/// \param FuncAttrs          Attribute list set on the generated function.
/// \returns The generated function. The local is named LtGCFunc although the
///          copy direction here is global-to-list.
///
/// NOTE(review): this listing omits several source lines (gaps in the embedded
/// line numbers); the definitions of OldIP and GlobValPtr and most of the
/// aggregate-copy statement are not visible here.
3146Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3147 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3148 AttributeList FuncAttrs) {
3150 LLVMContext &Ctx = M.getContext();
 // Parameter types: (ptr, i32, ptr).
3153 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3154 /* IsVarArg */ false);
3155 Function *LtGCFunc =
3157 "_omp_reduction_global_to_list_copy_func", &M);
3158 LtGCFunc->setAttributes(FuncAttrs);
3159 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3160 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3161 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3162
3163 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3164 Builder.SetInsertPoint(EntryBlock);
3165
3166 // Buffer: global reduction buffer.
3167 Argument *BufferArg = LtGCFunc->getArg(0);
3168 // Idx: index of the buffer.
3169 Argument *IdxArg = LtGCFunc->getArg(1);
3170 // ReduceList: thread local Reduce list.
3171 Argument *ReduceListArg = LtGCFunc->getArg(2);
3172
 // One "<arg>.addr" stack slot per argument, each followed by an
 // "<alloca>.ascast" cast.
3173 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3174 BufferArg->getName() + ".addr");
3175 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3176 IdxArg->getName() + ".addr");
3177 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3178 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3180 BufferArgAlloca, Builder.getPtrTy(),
3181 BufferArgAlloca->getName() + ".ascast");
3183 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3184 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3185 ReduceListArgAlloca, Builder.getPtrTy(),
3186 ReduceListArgAlloca->getName() + ".ascast");
 // Spill the arguments, then reload them for use in the copy loop.
3187 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3188 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3189 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3190
3191 Value *LocalReduceList =
3192 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3193 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3194 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3195 Type *IndexTy = Builder.getIndexTy(
3197 for (auto En : enumerate(ReductionInfos)) {
3198 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3199 auto *RedListArrayTy =
3200 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3201 // Reduce element = LocalReduceList[i]
3202 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3203 RedListArrayTy, LocalReduceList,
3204 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3205 // elemptr = ((CopyType*)(elemptrptr)) + I
3206 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3207 // Global = Buffer.VD[Idx];
3208 Value *BufferVD =
3209 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
 // NOTE(review): the head of the statement defining GlobValPtr is missing
 // from this listing; its visible arguments index field En.index() of
 // BufferVD.
3211 ReductionsBufferTy, BufferVD, 0, En.index());
3212
 // Copy direction for Scalar and Complex: GlobValPtr -> *ElemPtr
 // (thread-local).
3213 switch (RI.EvaluationKind) {
3214 case EvalKind::Scalar: {
 // Plain load from the global slot and store to the local element.
3215 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3216 Builder.CreateStore(TargetElement, ElemPtr);
3217 break;
3218 }
3219 case EvalKind::Complex: {
 // Copy struct fields 0 (".real") and 1 (".imag") separately.
3221 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3222 Value *SrcReal = Builder.CreateLoad(
3223 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3225 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3226 Value *SrcImg = Builder.CreateLoad(
3227 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3228
3230 RI.ElementType, ElemPtr, 0, 0, ".realp");
3232 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3233 Builder.CreateStore(SrcReal, DestRealPtr);
3234 Builder.CreateStore(SrcImg, DestImgPtr);
3235 break;
3236 }
3237 case EvalKind::Aggregate: {
 // NOTE(review): lines 3239-3241 are missing from this listing, so the
 // SizeVal initializer and the head of the copy call are not visible;
 // presumably a memcpy from GlobValPtr to ElemPtr -- confirm.
3238 Value *SizeVal =
3242 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3243 SizeVal, false);
3244 break;
3245 }
3246 }
3247 }
3248
 // NOTE(review): OldIP is defined on a line missing from this listing;
 // presumably the builder insert point saved at function entry.
3250 Builder.restoreIP(OldIP);
3251 return LtGCFunc;
3252}
3253
/// Emit the helper function "_omp_reduction_global_to_list_reduce_func" into
/// the module and return it.
///
/// The generated function takes (ptr Buffer, i32 Idx, ptr ReduceList). It
/// fills a local ".omp.reduction.red_list" pointer array with one pointer per
/// reduction, each pointing into element Idx of the buffer (typed as
/// \p ReductionsBufferTy), and then calls \p ReduceFn with the incoming
/// ReduceList as the first argument and the global pointer list as the second
/// (the reverse argument order of the list-to-global variant).
///
/// \param ReductionInfos     Reductions; determines the pointer array length.
/// \param ReduceFn           Function called as ReduceFn(list, global list).
/// \param ReductionsBufferTy Type used to index the global buffer.
/// \param FuncAttrs          Attribute list set on the generated function.
/// \returns The generated function (LtGRFunc).
///
/// NOTE(review): this listing omits several source lines (gaps in the embedded
/// line numbers); the definitions of OldIP, ReductionList and GlobValPtr are
/// not visible here.
3254Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3255 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3256 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3258 LLVMContext &Ctx = M.getContext();
3259 auto *FuncTy = FunctionType::get(
 // Parameter types: (ptr, i32, ptr). The return type is on a missing line.
3261 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3262 /* IsVarArg */ false);
3263 Function *LtGRFunc =
3265 "_omp_reduction_global_to_list_reduce_func", &M);
3266 LtGRFunc->setAttributes(FuncAttrs);
3267 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3268 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3269 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3270
3271 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3272 Builder.SetInsertPoint(EntryBlock);
3273
3274 // Buffer: global reduction buffer.
3275 Argument *BufferArg = LtGRFunc->getArg(0);
3276 // Idx: index of the buffer.
3277 Argument *IdxArg = LtGRFunc->getArg(1);
3278 // ReduceList: thread local Reduce list.
3279 Argument *ReduceListArg = LtGRFunc->getArg(2);
3280
 // One "<arg>.addr" stack slot per argument.
3281 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3282 BufferArg->getName() + ".addr");
3283 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3284 IdxArg->getName() + ".addr");
3285 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3286 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3287 ArrayType *RedListArrayTy =
3288 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3289
3290 // 1. Build a list of reduction variables.
3291 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3292 Value *LocalReduceList =
3293 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3294
 // "<alloca>.ascast" casts of the slots above.
 // NOTE(review): the last cast (of LocalReduceList) has its head on a missing
 // line; presumably it defines ReductionList, which is used below -- confirm.
3296 BufferArgAlloca, Builder.getPtrTy(),
3297 BufferArgAlloca->getName() + ".ascast");
3299 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3300 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3301 ReduceListArgAlloca, Builder.getPtrTy(),
3302 ReduceListArgAlloca->getName() + ".ascast");
3304 LocalReduceList, Builder.getPtrTy(),
3305 LocalReduceList->getName() + ".ascast");
3306
 // Spill the arguments, then reload the buffer pointer and the index.
3307 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3308 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3309 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3310
3311 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3312 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3313 Type *IndexTy = Builder.getIndexTy(
3315 for (auto En : enumerate(ReductionInfos)) {
 // Slot i of the local pointer array.
3316 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3317 RedListArrayTy, ReductionList,
3318 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3319 // Global = Buffer.VD[Idx];
3320 Value *BufferVD =
3321 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
 // NOTE(review): the head of the statement defining GlobValPtr is missing
 // from this listing; its visible arguments index field En.index() of
 // BufferVD.
3323 ReductionsBufferTy, BufferVD, 0, En.index());
3324 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3325 }
3326
3327 // Call reduce_function(ReduceList, GlobalReduceList)
3328 Value *ReduceList =
3329 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3330 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3331 ->addFnAttr(Attribute::NoUnwind);
 // NOTE(review): OldIP is defined on a line missing from this listing;
 // presumably the builder insert point saved at function entry.
3333 Builder.restoreIP(OldIP);
3334 return LtGRFunc;
3335}
3336
/// Return the name to use for a reduction function: \p Name followed by the
/// platform-specific name built from the parts "omp", "reduction" and
/// "reduction_func" (see createPlatformSpecificName for how the parts are
/// joined).
3337std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3338 std::string Suffix =
3339 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3340 return (Name + Suffix).str();
3341}
3342
/// Emit the reduction function void(ptr LHSList, ptr RHSList), named
/// getReductionFuncName(\p ReducerName), and return it.
///
/// Both arguments are treated as arrays of ReductionInfos.size() pointers.
/// For each reduction the i-th pointer of each array is loaded. Depending on
/// a condition whose line is missing from this listing, the pointers are
/// either recorded in LHSPtrs/RHSPtrs for the fix-up loop at the end, or the
/// two elements are loaded, combined through RI.ReductionGen and the result
/// stored back through the LHS pointer.
///
/// \param ReducerName        Base name handed to getReductionFuncName.
/// \param ReductionInfos     Reductions to combine.
/// \param ReductionGenCBKind Callback kind. NOTE(review): the lines that test
///                           it are missing from this listing -- presumably
///                           it selects between ReductionGen and
///                           ReductionGenClang; confirm.
/// \param FuncAttrs          Attribute list set on the generated function.
/// \returns The generated function, or the error produced by a ReductionGen
///          callback.
3343Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3344 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3345 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3346 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3347 {Builder.getPtrTy(), Builder.getPtrTy()},
3348 /* IsVarArg */ false);
3349 std::string Name = getReductionFuncName(ReducerName);
3350 Function *ReductionFunc =
3352 ReductionFunc->setAttributes(FuncAttrs);
3353 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3354 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3355 BasicBlock *EntryBB =
3356 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3357 Builder.SetInsertPoint(EntryBB);
3358
3359 // Need to alloca memory here and deal with the pointers before getting
3360 // LHS/RHS pointers out
3361 Value *LHSArrayPtr = nullptr;
3362 Value *RHSArrayPtr = nullptr;
3363 Argument *Arg0 = ReductionFunc->getArg(0);
3364 Argument *Arg1 = ReductionFunc->getArg(1);
3365 Type *Arg0Type = Arg0->getType();
3366 Type *Arg1Type = Arg1->getType();
3367
 // Spill both arguments to "<arg>.addr" slots and reload them as the LHS and
 // RHS array pointers.
 // NOTE(review): the heads of the two ".ascast" statements (defining
 // LHSAddrCast and RHSAddrCast) are on lines missing from this listing.
3368 Value *LHSAlloca =
3369 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3370 Value *RHSAlloca =
3371 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3373 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3375 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3376 Builder.CreateStore(Arg0, LHSAddrCast);
3377 Builder.CreateStore(Arg1, RHSAddrCast);
3378 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3379 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3380
3381 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3382 Type *IndexTy = Builder.getIndexTy(
3384 SmallVector<Value *> LHSPtrs, RHSPtrs;
3385 for (auto En : enumerate(ReductionInfos)) {
3386 const ReductionInfo &RI = En.value();
 // Load the i-th RHS pointer. The cast on the following (partly missing)
 // statement targets the type of RI.PrivateVariable and presumably defines
 // RHSPtr.
3387 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3388 RedArrayTy, RHSArrayPtr,
3389 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3390 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3392 RHSI8Ptr, RI.PrivateVariable->getType(),
3393 RHSI8Ptr->getName() + ".ascast");
3394
 // Load the i-th LHS pointer. The cast on the following (partly missing)
 // statement targets the type of RI.Variable and presumably defines LHSPtr.
3395 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3396 RedArrayTy, LHSArrayPtr,
3397 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3398 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3400 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3401
 // NOTE(review): the "if (...) {" line (3402) is missing from this listing.
 // In the first branch the pointers are only recorded for the fix-up loop
 // below.
3403 LHSPtrs.emplace_back(LHSPtr);
3404 RHSPtrs.emplace_back(RHSPtr);
3405 } else {
 // Load both operands, let the callback produce Reduced, and store it back
 // through the LHS pointer.
3406 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3407 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3408 Value *Reduced;
3409 InsertPointOrErrorTy AfterIP =
3410 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3411 if (!AfterIP)
3412 return AfterIP.takeError();
 // If the builder has no insert block after the callback, return the
 // function as built so far.
3413 if (!Builder.GetInsertBlock())
3414 return ReductionFunc;
3415 Builder.CreateStore(Reduced, LHSPtr);
3416 }
3417 }
3418
 // NOTE(review): line 3419 is missing from this listing; it presumably guards
 // the loop below -- confirm.
3420 for (auto En : enumerate(ReductionInfos)) {
3421 unsigned Index = En.index();
3422 const ReductionInfo &RI = En.value();
3423 Value *LHSFixupPtr, *RHSFixupPtr;
 // The callback emits code and reports, through the two out-parameters,
 // the placeholder values it used for LHS and RHS.
3424 Builder.restoreIP(RI.ReductionGenClang(
3425 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3426
3427 // Fix the CallBack code generated to use the correct Values for the LHS
3428 // and RHS
 // Only uses whose user instruction lives inside ReductionFunc are replaced.
3429 LHSFixupPtr->replaceUsesWithIf(
3430 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3431 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3432 ReductionFunc;
3433 });
3434 RHSFixupPtr->replaceUsesWithIf(
3435 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3436 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3437 ReductionFunc;
3438 });
3439 }
3440
3442 return ReductionFunc;
3443}
3444
/// Assert-only sanity check over a list of reduction infos.
///
/// For every entry it asserts that Variable and PrivateVariable are non-null,
/// that at least one of ReductionGen / ReductionGenClang is set, and that
/// Variable has pointer type. When \p IsGPU is false it additionally asserts
/// that Variable and PrivateVariable have the same type.
///
/// NOTE(review): the line carrying the function name and first parameter
/// (3446) is missing from this listing; a call site elsewhere in the file
/// shows the name is checkReductionInfos and the first argument is the
/// ReductionInfos list.
3445static void
3447 bool IsGPU) {
3448 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
 // Presumably keeps RI referenced when the asserts below compile away.
3449 (void)RI;
3450 assert(RI.Variable && "expected non-null variable");
3451 assert(RI.PrivateVariable && "expected non-null private variable");
3452 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3453 "expected non-null reduction generator callback");
 // The same-type requirement applies only when IsGPU is false.
3454 if (!IsGPU) {
3455 assert(
3456 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3457 "expected variables and their private equivalents to have the same "
3458 "type");
3459 }
3460 assert(RI.Variable->getType()->isPointerTy() &&
3461 "expected variables to be pointers");
3462 }
3463}
3464
/// Emit the device-side (GPU) lowering of an OpenMP reduction.
///
/// NOTE(review): the declarator line (listing line 3465) and several interior
/// lines are missing from this dump (gaps in the listing numbers). Names such
/// as `ReductionList`, `Cond` and `Pv2Ptr` are used below but are defined on
/// those missing lines. Presumably this is
/// OpenMPIRBuilder::createReductionsGPU -- confirm against the real source.
///
/// Visible behaviour, in order:
///  1. Return a default-constructed InsertPointTy if updateToLocation(Loc)
///     fails; otherwise continue at \p CodeGenIP.
///  2. Validate \p ReductionInfos (GPU mode) and create the ident value from
///     \p Loc when \p SrcLocInfo is null.
///  3. Return the current insertion point unchanged if there are no
///     reductions.
///  4. Copy the current function's function attributes, minus `optnone`, and
///     pass them to every helper function created below.
///  5. Build the element-wise reduction function (createReductionFunction).
///  6. Record the grid value in `Config`: \p GridValue if provided, else the
///     result of getGridValue(T, ReductionFunc).
///  7. Allocate an array of `ReductionInfos.size()` pointers at \p AllocaIP
///     and fill one slot per reduction at the code-generation point.
///  8. Emit the shuffle-and-reduce and inter-warp copy helpers. For a teams
///     reduction, also emit the list<->global copy/reduce helpers and fetch
///     the fixed reduction buffer.
///  9. Call __kmpc_nvptx_parallel_reduce_nowait_v2 or, when
///     \p IsTeamsReduction is set, __kmpc_nvptx_teams_reduce_nowait_v2.
/// 10. Branch on the runtime result into ".omp.reduction.then", where the
///     per-variable reduction callback code is emitted, and finish in
///     ".omp.reduction.done".
///
/// \returns the insertion point after the emitted code, or the error
///          produced by createReductionFunction / emitInterWarpCopyFunction.
///
/// \p IsNoWait and \p HasDistribute are not referenced on any line visible in
/// this dump.
3466 const LocationDescription &Loc, InsertPointTy AllocaIP,
3467 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3468 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3469 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3470 unsigned ReductionBufNum, Value *SrcLocInfo) {
3471 if (!updateToLocation(Loc))
3472 return InsertPointTy();
3473 Builder.restoreIP(CodeGenIP);
3474 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3475 LLVMContext &Ctx = M.getContext();
3476
3477 // Source location for the ident struct
3478 if (!SrcLocInfo) {
3479 uint32_t SrcLocStrSize;
3480 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3481 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3482 }
3483
// Nothing to reduce: hand back the current position without emitting IR.
3484 if (ReductionInfos.size() == 0)
3485 return Builder.saveIP();
3486
// Helper functions inherit the enclosing function's attributes except
// OptimizeNone, which is explicitly removed.
3487 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3488 AttributeList FuncAttrs;
3489 AttrBuilder AttrBldr(Ctx);
3490 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3491 AttrBldr.addAttribute(Attr);
3492 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3493 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3494
// Save the insertion point, create the reduction function, then restore the
// saved point (on the success path).
3495 CodeGenIP = Builder.saveIP();
3496 Expected<Function *> ReductionResult =
3497 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3498 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3499 if (!ReductionResult)
3500 return ReductionResult.takeError();
3501 Function *ReductionFunc = *ReductionResult;
3502 Builder.restoreIP(CodeGenIP);
3503
3504 // Set the grid value in the config needed for lowering later on
3505 if (GridValue.has_value())
3506 Config.setGridValue(GridValue.value());
3507 else
3508 Config.setGridValue(getGridValue(T, ReductionFunc));
3509
3510 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3511 // RedList, shuffle_reduce_func, interwarp_copy_func);
3512 // or
3513 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3514 Value *Res;
3515
3516 // 1. Build a list of reduction variables.
3517 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3518 auto Size = ReductionInfos.size();
3519 Type *PtrTy = PointerType::getUnqual(Ctx);
3520 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
// The alloca is emitted at AllocaIP; code generation then resumes at the
// previously saved position.
3521 CodeGenIP = Builder.saveIP();
3522 Builder.restoreIP(AllocaIP);
3523 Value *ReductionListAlloca =
3524 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
// NOTE(review): listing line 3525 is missing; it presumably defines
// `ReductionList` as a cast of the alloca (result named "<alloca>.ascast").
3526 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3527 Builder.restoreIP(CodeGenIP);
3528 Type *IndexTy = Builder.getIndexTy(
// NOTE(review): listing lines 3529 and 3536 are missing (the arguments of
// getIndexTy and the initialiser of `CastElem`).
3530 for (auto En : enumerate(ReductionInfos)) {
3531 const ReductionInfo &RI = En.value();
3532 Value *ElemPtr = Builder.CreateInBoundsGEP(
3533 RedArrayTy, ReductionList,
3534 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3535 Value *CastElem =
3537 Builder.CreateStore(CastElem, ElemPtr);
3538 }
// Emit the helper functions with the insertion point saved, and restore it
// afterwards.
3539 CodeGenIP = Builder.saveIP();
3540 Function *SarFunc =
3541 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3542 Expected<Function *> CopyResult =
3543 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3544 if (!CopyResult)
3545 return CopyResult.takeError();
3546 Function *WcFunc = *CopyResult;
3547 Builder.restoreIP(CodeGenIP);
3548
3549 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3550
// The size passed to the runtime is (largest element store size) times the
// number of reductions. The element types are collected as well and used as
// the members of the teams-reduction buffer struct below.
3551 unsigned MaxDataSize = 0;
3552 SmallVector<Type *> ReductionTypeArgs;
3553 for (auto En : enumerate(ReductionInfos)) {
3554 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3555 if (Size > MaxDataSize)
3556 MaxDataSize = Size;
3557 ReductionTypeArgs.emplace_back(En.value().ElementType);
3558 }
3559 Value *ReductionDataSize =
3560 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3561 if (!IsTeamsReduction) {
// NOTE(review): listing lines 3563, 3565 and 3568 are missing (the
// initialisers of `SarFuncCast` / `WcFuncCast` and the declaration of
// `Pv2Ptr`).
3562 Value *SarFuncCast =
3564 Value *WcFuncCast =
3566 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3567 WcFuncCast};
3569 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3570 Res = Builder.CreateCall(Pv2Ptr, Args);
3571 } else {
// Teams reduction: build the global buffer struct type and the four
// list<->global helper functions, then call the teams runtime entry.
3572 CodeGenIP = Builder.saveIP();
3573 StructType *ReductionsBufferTy = StructType::create(
3574 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3575 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3576 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3577 Function *LtGCFunc = emitListToGlobalCopyFunction(
3578 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3579 Function *LtGRFunc = emitListToGlobalReduceFunction(
3580 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3581 Function *GtLCFunc = emitGlobalToListCopyFunction(
3582 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3583 Function *GtLRFunc = emitGlobalToListReduceFunction(
3584 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3585 Builder.restoreIP(CodeGenIP);
3586
3587 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3588 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3589
3590 Value *Args3[] = {SrcLocInfo,
3591 KernelTeamsReductionPtr,
3592 Builder.getInt32(ReductionBufNum),
3593 ReductionDataSize,
3594 RL,
3595 SarFunc,
3596 WcFunc,
3597 LtGCFunc,
3598 LtGRFunc,
3599 GtLCFunc,
3600 GtLRFunc};
3601
3602 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3603 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3604 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3605 }
3606
3607 // 5. Build if (res == 1)
3608 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3609 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
// NOTE(review): listing line 3610 is missing; it presumably defines `Cond`
// by comparing `Res` with 1, per the step comment above.
3611 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3612
3613 // 6. Build then branch: where we have reduced values in the master
3614 // thread in each team.
3615 // __kmpc_end_reduce{_nowait}(<gtid>);
3616 // break;
3617 emitBlock(ThenBB, CurFunc);
3618
3619 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3620 for (auto En : enumerate(ReductionInfos)) {
3621 const ReductionInfo &RI = En.value();
3622 Value *LHS = RI.Variable;
// NOTE(review): listing lines 3624, 3626 and 3628 are missing: the
// initialiser of `RHS`, the `if` that the `} else {` below belongs to, and
// the start of the statement that ends with "&LHSPtr, &RHSPtr, CurFunc));".
3623 Value *RHS =
3625
3627 Value *LHSPtr, *RHSPtr;
3629 &LHSPtr, &RHSPtr, CurFunc));
3630
3631 // Fix the CallBack code generated to use the correct Values for the LHS
3632 // and RHS
// Only uses by instructions located in ReductionFunc are replaced.
3633 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3634 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3635 ReductionFunc;
3636 });
3637 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3638 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3639 ReductionFunc;
3640 });
3641 } else {
3642 assert(false && "Unhandled ReductionGenCBKind");
3643 }
3644 }
3645 emitBlock(ExitBB, CurFunc);
3646
// NOTE(review): listing line 3647 is missing from this dump.
3648
3649 return Builder.saveIP();
3650}
3651
/// Helper that builds the function type `void(ptr, ptr)` (non-variadic) for an
/// outlined reduction function in module `M`.
///
/// NOTE(review): the declarator (listing line 3652) and the start of the
/// final statement (listing line 3657) are missing from this dump. The visible
/// tail `".omp.reduction.func", &M);` suggests the function is created with
/// that name inside `M` and returned -- confirm against the real source.
3653 Type *VoidTy = Type::getVoidTy(M.getContext());
// Opaque pointer type in the default address space; used for both parameters.
3654 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3655 auto *FuncTy =
3656 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3658 ".omp.reduction.func", &M);
3659}
3660
/// Emit the host-side lowering of an OpenMP reduction at \p Loc.
///
/// NOTE(review): the declarator lines (listing lines 3661-3662) and several
/// interior lines are missing from this dump (gaps in the listing numbers).
/// Names such as `Func`, `ReduceFunc`, `ValueType`, `LHSI8PtrPtr` and
/// `RHSI8PtrPtr` are used below but defined on those missing lines.
/// Presumably this is OpenMPIRBuilder::createReductions -- confirm.
///
/// Visible behaviour:
///  * Asserts that \p ReductionInfos and \p IsByRef have the same length and
///    that every entry has non-null `Variable`, `PrivateVariable` and
///    `ReductionGen`, with matching pointer types.
///  * Returns a default-constructed InsertPointTy if updateToLocation(Loc)
///    fails.
///  * Splits the insertion block at `Loc.IP` into a "reduce.finalize"
///    continuation block and removes the terminator of the first half.
///  * Allocates "red.array" (one pointer per reduction) and stores each
///    `PrivateVariable` into it.
///  * Calls __kmpc_reduce or __kmpc_reduce_nowait (selected by \p IsNoWait)
///    and switches on the result: 1 -> non-atomic block, 2 -> atomic block,
///    anything else -> continuation block.
///  * Non-atomic block: per entry, loads the operands (the shared value is
///    not loaded for by-ref entries), runs `ReductionGen`, stores the result
///    (not for by-ref entries); then calls __kmpc_end_reduce{_nowait} and
///    branches to the continuation.
///  * Atomic block: populated only if every entry provides
///    `AtomicReductionGen` and no entry is by-ref.
///  * Fills in a freshly created outlined reduction function that reduces two
///    arrays of type-erased pointers element by element.
///
/// \returns the insertion point at the start of the continuation block; the
///          error returned by a generator callback; or a default-constructed
///          InsertPointTy if a callback leaves the builder without an insert
///          block.
3663 InsertPointTy AllocaIP,
3664 ArrayRef<ReductionInfo> ReductionInfos,
3665 ArrayRef<bool> IsByRef, bool IsNoWait) {
3666 assert(ReductionInfos.size() == IsByRef.size());
3667 for (const ReductionInfo &RI : ReductionInfos) {
3668 (void)RI;
3669 assert(RI.Variable && "expected non-null variable");
3670 assert(RI.PrivateVariable && "expected non-null private variable");
3671 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3672 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3673 "expected variables and their private equivalents to have the same "
3674 "type");
3675 assert(RI.Variable->getType()->isPointerTy() &&
3676 "expected variables to be pointers");
3677 }
3678
3679 if (!updateToLocation(Loc))
3680 return InsertPointTy();
3681
// Everything from Loc.IP onwards moves to "reduce.finalize". The terminator
// left in the first half is erased; the switch emitted below becomes the new
// terminator of InsertBlock.
3682 BasicBlock *InsertBlock = Loc.IP.getBlock();
3683 BasicBlock *ContinuationBlock =
3684 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3685 InsertBlock->getTerminator()->eraseFromParent();
3686
3687 // Create and populate array of type-erased pointers to private reduction
3688 // values.
3689 unsigned NumReductions = ReductionInfos.size();
3690 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
// NOTE(review): listing line 3691 is missing; it presumably moves the
// builder to AllocaIP before the alloca below -- confirm.
3692 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3693
3694 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3695
3696 for (auto En : enumerate(ReductionInfos)) {
3697 unsigned Index = En.index();
3698 const ReductionInfo &RI = En.value();
3699 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3700 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3701 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3702 }
3703
3704 // Emit a call to the runtime function that orchestrates the reduction.
3705 // Declare the reduction function in the process.
// NOTE(review): listing line 3706 is missing; it presumably defines `Func`,
// the function being emitted into.
3707 Module *Module = Func->getParent();
3708 uint32_t SrcLocStrSize;
3709 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// The atomic-reduce ident flag is set only if every reduction supplies an
// atomic generator.
3710 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3711 return RI.AtomicReductionGen;
3712 });
3713 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3714 CanGenerateAtomic
3715 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3716 : IdentFlag(0));
3717 Value *ThreadId = getOrCreateThreadID(Ident);
3718 Constant *NumVariables = Builder.getInt32(NumReductions);
3719 const DataLayout &DL = Module->getDataLayout();
3720 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3721 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3722 Function *ReductionFunc = getFreshReductionFunc(*Module);
3723 Value *Lock = getOMPCriticalRegionLock(".reduction");
// NOTE(review): listing line 3724 is missing; it presumably declares
// `ReduceFunc` as the runtime function selected below.
3725 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3726 : RuntimeFunction::OMPRTL___kmpc_reduce);
3727 CallInst *ReduceCall =
3728 Builder.CreateCall(ReduceFunc,
3729 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3730 ReductionFunc, Lock},
3731 "reduce");
3732
3733 // Create final reduction entry blocks for the atomic and non-atomic case.
3734 // Emit IR that dispatches control flow to one of the blocks based on the
3735 // reduction supporting the atomic mode.
3736 BasicBlock *NonAtomicRedBlock =
3737 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3738 BasicBlock *AtomicRedBlock =
3739 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
// Default destination is the continuation block, so any result other than
// 1 or 2 skips both reduction blocks.
3740 SwitchInst *Switch =
3741 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3742 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3743 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3744
3745 // Populate the non-atomic reduction using the elementwise reduction function.
3746 // This loads the elements from the global and private variables and reduces
3747 // them before storing back the result to the global variable.
3748 Builder.SetInsertPoint(NonAtomicRedBlock);
3749 for (auto En : enumerate(ReductionInfos)) {
3750 const ReductionInfo &RI = En.value();
// NOTE(review): listing line 3751 is missing; it presumably declares
// `ValueType` (the type loaded below).
3752 // We have one less load for by-ref case because that load is now inside of
3753 // the reduction region
3754 Value *RedValue = RI.Variable;
3755 if (!IsByRef[En.index()]) {
3756 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3757 "red.value." + Twine(En.index()));
3758 }
// NOTE(review): listing line 3760 is missing (start of the initialiser of
// `PrivateRedValue`).
3759 Value *PrivateRedValue =
3761 "red.private.value." + Twine(En.index()));
3762 Value *Reduced;
3763 InsertPointOrErrorTy AfterIP =
3764 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3765 if (!AfterIP)
3766 return AfterIP.takeError();
3767 Builder.restoreIP(*AfterIP);
3768
// The callback may return an insertion point without a block; give up and
// return an unset insertion point in that case.
3769 if (!Builder.GetInsertBlock())
3770 return InsertPointTy();
3771 // for by-ref case, the load is inside of the reduction region
3772 if (!IsByRef[En.index()])
3773 Builder.CreateStore(Reduced, RI.Variable);
3774 }
3775 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3776 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3777 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3778 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3779 Builder.CreateBr(ContinuationBlock);
3780
3781 // Populate the atomic reduction using the atomic elementwise reduction
3782 // function. There are no loads/stores here because they will be happening
3783 // inside the atomic elementwise reduction.
3784 Builder.SetInsertPoint(AtomicRedBlock);
// The atomic path is generated only when every reduction has an atomic
// generator and none of the variables is passed by reference.
3785 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3786 for (const ReductionInfo &RI : ReductionInfos) {
// NOTE(review): listing lines 3787-3788 are missing; they presumably invoke
// RI.AtomicReductionGen and define `AfterIP`.
3789 if (!AfterIP)
3790 return AfterIP.takeError();
3791 Builder.restoreIP(*AfterIP);
3792 if (!Builder.GetInsertBlock())
3793 return InsertPointTy();
3794 }
3795 Builder.CreateBr(ContinuationBlock);
3796 } else {
// NOTE(review): listing line 3797 (the body of this else branch) is missing.
3798 }
3799
3800 // Populate the outlined reduction function using the elementwise reduction
3801 // function. Partial values are extracted from the type-erased array of
3802 // pointers to private variables.
3803 BasicBlock *ReductionFuncBlock =
3804 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3805 Builder.SetInsertPoint(ReductionFuncBlock);
3806 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3807 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3808
3809 for (auto En : enumerate(ReductionInfos)) {
3810 const ReductionInfo &RI = En.value();
// NOTE(review): listing lines 3811, 3816 and 3820 are missing; they
// presumably declare `LHSI8PtrPtr` / `RHSI8PtrPtr` (element addresses in the
// two argument arrays) and the initialiser of `RHSPtr`.
3812 RedArrayTy, LHSArrayPtr, 0, En.index());
3813 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3814 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3815 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3817 RedArrayTy, RHSArrayPtr, 0, En.index());
3818 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3819 Value *RHSPtr =
3821 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3822 Value *Reduced;
3823 InsertPointOrErrorTy AfterIP =
3824 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3825 if (!AfterIP)
3826 return AfterIP.takeError();
3827 Builder.restoreIP(*AfterIP);
3828 if (!Builder.GetInsertBlock())
3829 return InsertPointTy();
3830 // store is inside of the reduction region when using by-ref
3831 if (!IsByRef[En.index()])
3832 Builder.CreateStore(Reduced, LHSPtr);
3833 }
// NOTE(review): listing line 3834 is missing; it presumably terminates the
// outlined function (e.g. a `ret void`) -- confirm.
3835
3836 Builder.SetInsertPoint(ContinuationBlock);
3837 return Builder.saveIP();
3838}
3839
/// Emit an OpenMP `master` region.
///
/// NOTE(review): the declarator lines (listing lines 3840-3841) carrying the
/// return type, function name and the `Loc` parameter are missing from this
/// dump. Presumably OpenMPIRBuilder::createMaster -- confirm.
///
/// Creates calls to __kmpc_master and __kmpc_end_master, both taking
/// (ident, thread id), and passes them with the two callbacks to
/// EmitOMPInlinedRegion as a conditional region with finalization.
///
/// \param BodyGenCB Callback forwarded to EmitOMPInlinedRegion.
/// \param FiniCB    Callback forwarded to EmitOMPInlinedRegion.
/// \returns `Loc.IP` unchanged if updateToLocation(Loc) fails, otherwise the
///          result of EmitOMPInlinedRegion.
3842 BodyGenCallbackTy BodyGenCB,
3843 FinalizeCallbackTy FiniCB) {
3844 if (!updateToLocation(Loc))
3845 return Loc.IP;
3846
3847 Directive OMPD = Directive::OMPD_master;
3848 uint32_t SrcLocStrSize;
3849 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3850 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3851 Value *ThreadId = getOrCreateThreadID(Ident);
// Entry and exit runtime calls share the same argument list.
3852 Value *Args[] = {Ident, ThreadId};
3853
3854 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3855 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3856
3857 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3858 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3859
3860 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3861 /*Conditional*/ true, /*hasFinalize*/ true);
3862}
3863
/// Emit an OpenMP `masked` region.
///
/// NOTE(review): the declarator lines (listing lines 3864-3865) carrying the
/// return type, function name and the `Loc` parameter are missing from this
/// dump. Presumably OpenMPIRBuilder::createMasked -- confirm.
///
/// Creates a call to __kmpc_masked with (ident, thread id, \p Filter) and a
/// call to __kmpc_end_masked with (ident, thread id), and passes them with
/// the two callbacks to EmitOMPInlinedRegion as a conditional region with
/// finalization.
///
/// \param BodyGenCB Callback forwarded to EmitOMPInlinedRegion.
/// \param FiniCB    Callback forwarded to EmitOMPInlinedRegion.
/// \param Filter    Value passed as the third argument of __kmpc_masked.
/// \returns `Loc.IP` unchanged if updateToLocation(Loc) fails, otherwise the
///          result of EmitOMPInlinedRegion.
3866 BodyGenCallbackTy BodyGenCB,
3867 FinalizeCallbackTy FiniCB, Value *Filter) {
3868 if (!updateToLocation(Loc))
3869 return Loc.IP;
3870
3871 Directive OMPD = Directive::OMPD_masked;
3872 uint32_t SrcLocStrSize;
3873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3875 Value *ThreadId = getOrCreateThreadID(Ident);
// The entry call additionally receives the filter; the exit call does not.
3876 Value *Args[] = {Ident, ThreadId, Filter};
3877 Value *ArgsEnd[] = {Ident, ThreadId};
3878
3879 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3880 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3881
3882 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3883 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3884
3885 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3886 /*Conditional*/ true, /*hasFinalize*/ true);
3887}
3888
/// Create the basic-block skeleton of a canonical loop in function \p F and
/// register it in `LoopInfos`.
///
/// NOTE(review): the declarator line (listing line 3889) and interior listing
/// lines 3913, 3921, 3923 and 3938 are missing from this dump. Presumably
/// OpenMPIRBuilder::createLoopSkeleton, as called from createCanonicalLoop.
///
/// Blocks created, all named "omp_<Name>.<suffix>":
///   preheader, header, cond, body  -- inserted before \p PreInsertBefore
///   inc (latch), exit, after       -- inserted before \p PostInsertBefore
///
/// Visible control flow: preheader -> header; cond branches to body while
/// `iv <u TripCount` and to exit otherwise; body -> inc; inc -> header. The
/// induction variable is a PHI in the header of the same type as
/// \p TripCount, starting at 0 from the preheader and incremented by 1 in the
/// latch (add with the no-unsigned-wrap flag).
///
/// \param DL        Debug location intended for the new instructions (the
///                  statement applying it is on a missing line).
/// \param TripCount Loop trip count; its type is used for the induction
///                  variable.
/// \returns pointer to the new CanonicalLoopInfo, stored at the front of
///          `LoopInfos`, with Header, Cond, Latch and Exit set.
3890 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3891 BasicBlock *PostInsertBefore, const Twine &Name) {
3892 Module *M = F->getParent();
3893 LLVMContext &Ctx = M->getContext();
3894 Type *IndVarTy = TripCount->getType();
3895
3896 // Create the basic block structure.
3897 BasicBlock *Preheader =
3898 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3899 BasicBlock *Header =
3900 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3901 BasicBlock *Cond =
3902 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3903 BasicBlock *Body =
3904 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3905 BasicBlock *Latch =
3906 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3907 BasicBlock *Exit =
3908 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3909 BasicBlock *After =
3910 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3911
3912 // Use specified DebugLoc for new instructions.
// NOTE(review): listing line 3913 (the statement that applies DL) is missing.
3914
3915 Builder.SetInsertPoint(Preheader);
3916 Builder.CreateBr(Header);
3917
3918 Builder.SetInsertPoint(Header);
3919 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3920 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
// NOTE(review): listing lines 3921 and 3923 are missing; presumably the
// header's branch to Cond and the switch of the insert point to Cond.
3922
// Unsigned comparison: the loop runs while iv < TripCount.
3924 Value *Cmp =
3925 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3926 Builder.CreateCondBr(Cmp, Body, Exit);
3927
// The body is created empty apart from its branch to the latch.
3928 Builder.SetInsertPoint(Body);
3929 Builder.CreateBr(Latch);
3930
3931 Builder.SetInsertPoint(Latch);
3932 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3933 "omp_" + Name + ".next", /*HasNUW=*/true);
3934 Builder.CreateBr(Header);
3935 IndVarPHI->addIncoming(Next, Latch);
3936
3937 Builder.SetInsertPoint(Exit);
// NOTE(review): listing line 3938 is missing; presumably the exit block's
// branch to After (After is otherwise unused in the visible lines).
3939
3940 // Remember and return the canonical control flow.
3941 LoopInfos.emplace_front();
3942 CanonicalLoopInfo *CL = &LoopInfos.front();
3943
3944 CL->Header = Header;
3945 CL->Cond = Cond;
3946 CL->Latch = Latch;
3947 CL->Exit = Exit;
3948
3949#ifndef NDEBUG
3950 CL->assertOK();
3951#endif
3952 return CL;
3953}
3954
/// Create a canonical loop with \p TripCount iterations at \p Loc and let
/// \p BodyGenCB fill in its body.
///
/// NOTE(review): the declarator lines (listing lines 3955-3956) and interior
/// listing line 3972 are missing from this dump. Presumably the trip-count
/// overload of OpenMPIRBuilder::createCanonicalLoop -- confirm.
///
/// The skeleton is created in the function containing `Loc.IP`'s block, with
/// all new blocks inserted before that block's successor in layout order. If
/// updateToLocation(Loc) succeeds, the instructions following the insertion
/// point are moved into the loop's "after" block.
///
/// \param BodyGenCB Invoked with the loop's body insertion point and its
///                  induction variable.
/// \param TripCount Number of iterations.
/// \param Name      Base name for the generated blocks and values.
/// \returns the new CanonicalLoopInfo, or the error returned by
///          \p BodyGenCB.
///
/// NOTE(review): `Loc.IP.getBlock()` is dereferenced before
/// updateToLocation(Loc) is checked, so the "location is not set" path
/// mentioned below still needs a block in `Loc.IP` -- confirm the intended
/// contract.
3957 LoopBodyGenCallbackTy BodyGenCB,
3958 Value *TripCount, const Twine &Name) {
3959 BasicBlock *BB = Loc.IP.getBlock();
3960 BasicBlock *NextBB = BB->getNextNode();
3961
3962 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3963 NextBB, NextBB, Name);
3964 BasicBlock *After = CL->getAfter();
3965
3966 // If location is not set, don't connect the loop.
3967 if (updateToLocation(Loc)) {
3968 // Split the loop at the insertion point: Branch to the preheader and move
3969 // every following instruction to after the loop (the After BB). Also, the
3970 // new successor is the loop's after block.
3971 spliceBB(Builder, After, /*CreateBranch=*/false);
// NOTE(review): listing line 3972 is missing; per the comment above it
// presumably emits the branch to the loop's preheader.
3973 }
3974
3975 // Emit the body content. We do it after connecting the loop to the CFG to
3976 // avoid that the callback encounters degenerate BBs.
3977 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
3978 return Err;
3979
3980#ifndef NDEBUG
3981 CL->assertOK();
3982#endif
3983 return CL;
3984}
3985
/// Create a canonical loop from a (Start, Stop, Step) description.
///
/// NOTE(review): the first declarator line (listing line 3986) is missing
/// from this dump. Presumably the start/stop/step overload of
/// OpenMPIRBuilder::createCanonicalLoop -- confirm.
///
/// Emits IR that computes the trip count and delegates to the trip-count
/// overload. The body callback receives the logical value
/// `Start + IV * Step`, where IV is the canonical induction variable.
///
/// \param Loc           Where the loop is created when \p ComputeIP is set;
///                      otherwise it is also where the trip count is
///                      computed.
/// \param BodyGenCB     Receives the body insertion point and the logical
///                      induction value.
/// \param Start,Stop,Step Loop bounds and increment; all must share one
///                      integer type.
/// \param IsSigned      Whether the bounds and step are treated as signed.
/// \param InclusiveStop Whether \p Stop itself is part of the iteration
///                      space.
/// \param ComputeIP     If set, the trip-count computation is emitted here
///                      and the loop itself at `Loc.IP`.
/// \param Name          Base name for generated values ("omp_<Name>...").
/// \returns whatever the trip-count overload returns.
3987 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3988 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3989 InsertPointTy ComputeIP, const Twine &Name) {
3990
3991 // Consider the following difficulties (assuming 8-bit signed integers):
3992 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3993 // DO I = 1, 100, 50
3994 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3995 // DO I = 100, 0, -128
3996
3997 // Start, Stop and Step must be of the same integer type.
3998 auto *IndVarTy = cast<IntegerType>(Start->getType());
3999 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4000 assert(IndVarTy == Step->getType() && "Step type mismatch");
4001
// The trip-count arithmetic is emitted at ComputeIP when it is set, else at
// Loc.
4002 LocationDescription ComputeLoc =
4003 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4004 updateToLocation(ComputeLoc);
4005
4006 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4007 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4008
4009 // Like Step, but always positive.
4010 Value *Incr = Step;
4011
4012 // Distance between Start and Stop; always positive.
4013 Value *Span;
4014
4015 // Condition whether no iterations are executed at all, e.g. because
4016 // UB < LB.
4017 Value *ZeroCmp;
4018
4019 if (IsSigned) {
4020 // Ensure that increment is positive. If not, negate and invert LB and UB.
4021 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4022 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4023 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4024 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4025 Span = Builder.CreateSub(UB, LB, "", false, true);
// The loop is empty when UB < LB (inclusive stop) or UB <= LB (exclusive).
4026 ZeroCmp = Builder.CreateICmp(
4027 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4028 } else {
4029 Span = Builder.CreateSub(Stop, Start, "", true);
4030 ZeroCmp = Builder.CreateICmp(
4031 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4032 }
4033
// Trip count assuming at least one iteration:
//   inclusive: Span / Incr + 1
//   exclusive: 1 if Span <= Incr, else (Span - 1) / Incr + 1
// Divisions and the Span/Incr comparison are unsigned.
4034 Value *CountIfLooping;
4035 if (InclusiveStop) {
4036 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4037 } else {
4038 // Avoid incrementing past stop since it could overflow.
4039 Value *CountIfTwo = Builder.CreateAdd(
4040 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4041 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4042 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4043 }
4044 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4045 "omp_" + Name + ".tripcount");
4046
// Map the canonical induction variable IV back to the logical value
// IV * Step + Start before handing control to the user's callback. Uses the
// original (possibly negative) Step, not Incr.
4047 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4048 Builder.restoreIP(CodeGenIP);
4049 Value *Span = Builder.CreateMul(IV, Step);
4050 Value *IndVar = Builder.CreateAdd(Span, Start);
4051 return BodyGenCB(Builder.saveIP(), IndVar);
4052 };
// With a separate ComputeIP the loop goes to Loc.IP; otherwise it follows
// directly after the trip-count computation.
4053 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4054 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4055}
4056
4057// Returns an LLVM function to call for initializing loop bounds using OpenMP
4058// static scheduling depending on the bit width of `Ty`. Only i32 and i64 are
4059// supported by the runtime. Always interpret integers as unsigned similarly to
4060// CanonicalLoopInfo.
//
// 32-bit selects __kmpc_for_static_init_4u and 64-bit selects
// __kmpc_for_static_init_8u. Any other width reaches llvm_unreachable.
//
// NOTE(review): the first declarator line (listing line 4061: return type,
// function name and leading parameters) is missing from this dump. The body
// uses `Ty` and `M`, and callers pass (type, module, builder).
4062 OpenMPIRBuilder &OMPBuilder) {
4063 unsigned Bitwidth = Ty->getIntegerBitWidth();
4064 if (Bitwidth == 32)
4065 return OMPBuilder.getOrCreateRuntimeFunction(
4066 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4067 if (Bitwidth == 64)
4068 return OMPBuilder.getOrCreateRuntimeFunction(
4069 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4070 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4071}
4072
/// Turn the canonical loop \p CLI into a statically scheduled (unchunked)
/// worksharing loop.
///
/// NOTE(review): the return-type line (listing line 4073) and interior
/// listing lines 4082-4083, 4109, 4138, 4140 and 4145 are missing from this
/// dump; they appear to hold builder positioning / debug-location statements.
///
/// Visible behaviour:
///  * Allocates p.lastiter (i32) and p.lowerbound / p.upperbound / p.stride
///    (induction-variable type) in the block of \p AllocaIP.
///  * Stores the bounds [0, tripcount - 1] with stride 1 and calls
///    __kmpc_for_static_init_{4u,8u} with schedule type UnorderedStatic.
///  * Reloads the bounds written by the runtime and sets the loop's trip
///    count to `ub - lb + 1`.
///  * Remaps the induction variable so that its users see `iv + lb`.
///  * Calls __kmpc_for_static_fini in the exit block and, if \p NeedsBarrier
///    is set, emits a barrier after it.
///  * Invalidates \p CLI.
///
/// \param DL           Debug location used for the source-location ident and
///                     the barrier.
/// \param CLI          Valid canonical loop to transform.
/// \param AllocaIP     Insertion point whose block receives the allocas; must
///                     not conflict with the loop's preheader insertion point.
/// \param NeedsBarrier Whether to emit a barrier after the loop.
/// \returns the loop's "after" insertion point, or the error from
///          createBarrier.
4074OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4075 InsertPointTy AllocaIP,
4076 bool NeedsBarrier) {
4077 assert(CLI->isValid() && "Requires a valid canonical loop");
4078 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4079 "Require dedicated allocate IP");
4080
4081 // Set up the source location value for OpenMP runtime.
// NOTE(review): listing lines 4082-4083 are missing from this dump.
4084
4085 uint32_t SrcLocStrSize;
4086 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4087 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4088
4089 // Declare useful OpenMP runtime functions.
4090 Value *IV = CLI->getIndVar();
4091 Type *IVTy = IV->getType();
4092 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4093 FunctionCallee StaticFini =
4094 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4095
4096 // Allocate space for computed loop bounds as expected by the "init" function.
4097 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4098
4099 Type *I32Type = Type::getInt32Ty(M.getContext());
4100 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4101 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4102 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4103 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4104
4105 // At the end of the preheader, prepare for calling the "init" function by
4106 // storing the current loop bounds into the allocated space. A canonical loop
4107 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4108 // and produces an inclusive upper bound.
// NOTE(review): listing line 4109 is missing; per the comment above it
// presumably positions the builder at the end of the preheader.
4110 Constant *Zero = ConstantInt::get(IVTy, 0);
4111 Constant *One = ConstantInt::get(IVTy, 1);
4112 Builder.CreateStore(Zero, PLowerBound);
4113 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4114 Builder.CreateStore(UpperBound, PUpperBound);
4115 Builder.CreateStore(One, PStride);
4116
4117 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4118
4119 Constant *SchedulingType = ConstantInt::get(
4120 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4121
4122 // Call the "init" function and update the trip count of the loop with the
4123 // value it produced.
// The last two arguments are One and Zero (presumably increment and chunk,
// matching the labelled call in applyStaticChunkedWorkshareLoop -- confirm).
4124 Builder.CreateCall(StaticInit,
4125 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4126 PUpperBound, PStride, One, Zero});
4127 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4128 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4129 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4130 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4131 CLI->setTripCount(TripCount);
4132
4133 // Update all uses of the induction variable except the one in the condition
4134 // block that compares it with the actual upper bound, and the increment in
4135 // the latch block.
4136
// The replacement value is OldIV + LowerBound.
// NOTE(review): listing lines 4138 and 4140 are missing; 4138 presumably
// starts the SetInsertPoint call whose second argument is on the next line.
4137 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4139 CLI->getBody()->getFirstInsertionPt());
4141 return Builder.CreateAdd(OldIV, LowerBound);
4142 });
4143
4144 // In the "exit" block, call the "fini" function.
// NOTE(review): listing line 4145 is missing; it presumably starts the
// SetInsertPoint call that targets the exit block's terminator.
4146 CLI->getExit()->getTerminator()->getIterator());
4147 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4148
4149 // Add the barrier if requested.
4150 if (NeedsBarrier) {
4151 InsertPointOrErrorTy BarrierIP =
4152 createBarrier(LocationDescription(Builder.saveIP(), DL),
4153 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4154 /* CheckCancelFlag */ false);
4155 if (!BarrierIP)
4156 return BarrierIP.takeError();
4157 }
4158
// Fetch the after-IP before invalidating CLI.
4159 InsertPointTy AfterIP = CLI->getAfterIP();
4160 CLI->invalidate();
4161
4162 return AfterIP;
4163}
4164
/// Turn the canonical loop \p CLI into a statically scheduled, chunked
/// worksharing loop.
///
/// NOTE(review): the return-type line (listing line 4165) and interior
/// listing lines 4194, 4203-4204, 4246, 4278-4279 and 4282 are missing from
/// this dump. `LoopResult` is declared on one of them.
///
/// Visible behaviour:
///  * Picks an internal induction type: i32 if the original IV is at most 32
///    bits wide, otherwise i64 (wider than 64 bits is rejected by assert).
///  * Allocates p.lastiter / p.lowerbound / p.upperbound / p.stride at
///    \p AllocaIP, stores bounds [0, tripcount - 1] with stride 1, and calls
///    __kmpc_for_static_init_{4u,8u} with schedule type
///    UnorderedStaticChunked and the (zero-extended or truncated) chunk size.
///  * Reads back the first chunk's bounds and the stride between chunks.
///  * Creates an outer "dispatch" loop running from the first chunk start up
///    to (exclusive) the trip count, stepping by that stride, and rewires the
///    CFG so the original loop becomes the chunk loop inside its body.
///  * Sets the chunk loop's trip count to the chunk range, or to the
///    remaining iterations for the last chunk, truncated back to the original
///    IV type.
///  * Remaps the IV so that its users see `iv + dispatch counter`.
///  * Calls __kmpc_for_static_fini in the dispatch loop's exit block and, if
///    \p NeedsBarrier is set, emits a barrier after it.
///
/// \param DL           Debug location used for redirects, the ident and the
///                     barrier.
/// \param CLI          Valid canonical loop; asserted to still be canonical
///                     at the end.
/// \param AllocaIP     Insertion point for the allocas.
/// \param NeedsBarrier Whether to emit a barrier after the loop.
/// \param ChunkSize    Requested chunk size; must be non-null.
/// \returns the first insertion point of the dispatch loop's "after" block,
///          or the error from createBarrier.
4166OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4167 CanonicalLoopInfo *CLI,
4168 InsertPointTy AllocaIP,
4169 bool NeedsBarrier,
4170 Value *ChunkSize) {
4171 assert(CLI->isValid() && "Requires a valid canonical loop");
4172 assert(ChunkSize && "Chunk size is required");
4173
4174 LLVMContext &Ctx = CLI->getFunction()->getContext();
4175 Value *IV = CLI->getIndVar();
4176 Value *OrigTripCount = CLI->getTripCount();
4177 Type *IVTy = IV->getType();
4178 assert(IVTy->getIntegerBitWidth() <= 64 &&
4179 "Max supported tripcount bitwidth is 64 bits");
// Arithmetic for the runtime is done in i32 or i64, whichever is the first to
// fit the original IV type.
4180 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4181 : Type::getInt64Ty(Ctx);
4182 Type *I32Type = Type::getInt32Ty(M.getContext());
4183 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4184 Constant *One = ConstantInt::get(InternalIVTy, 1);
4185
4186 // Declare useful OpenMP runtime functions.
4187 FunctionCallee StaticInit =
4188 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4189 FunctionCallee StaticFini =
4190 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4191
4192 // Allocate space for computed loop bounds as expected by the "init" function.
4193 Builder.restoreIP(AllocaIP);
// NOTE(review): listing line 4194 is missing from this dump.
4195 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4196 Value *PLowerBound =
4197 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4198 Value *PUpperBound =
4199 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4200 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4201
4202 // Set up the source location value for the OpenMP runtime.
// NOTE(review): listing lines 4203-4204 are missing; they presumably move
// the builder out of the alloca block before the code below -- confirm.
4205
4206 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4207 Value *CastedChunkSize =
4208 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4209 Value *CastedTripCount =
4210 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4211
4212 Constant *SchedulingType = ConstantInt::get(
4213 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4214 Builder.CreateStore(Zero, PLowerBound);
4215 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4216 Builder.CreateStore(OrigUpperBound, PUpperBound);
4217 Builder.CreateStore(One, PStride);
4218
4219 // Call the "init" function and update the trip count of the loop with the
4220 // value it produced.
4221 uint32_t SrcLocStrSize;
4222 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4223 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4224 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4225 Builder.CreateCall(StaticInit,
4226 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4227 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4228 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4229 /*pstride=*/PStride, /*incr=*/One,
4230 /*chunk=*/CastedChunkSize});
4231
4232 // Load values written by the "init" function.
// ChunkRange = (first chunk's inclusive upper bound + 1) - first chunk's
// lower bound.
4233 Value *FirstChunkStart =
4234 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4235 Value *FirstChunkStop =
4236 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4237 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4238 Value *ChunkRange =
4239 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4240 Value *NextChunkStride =
4241 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4242
4243 // Create outer "dispatch" loop for enumerating the chunks.
4244 BasicBlock *DispatchEnter = splitBB(Builder, true);
4245 Value *DispatchCounter;
// NOTE(review): listing line 4246 is missing; it presumably declares
// `LoopResult` as the result of a createCanonicalLoop call whose arguments
// follow. The callback only captures the loop counter and emits no body.
4247 {Builder.saveIP(), DL},
4248 [&](InsertPointTy BodyIP, Value *Counter) {
4249 DispatchCounter = Counter;
4250 return Error::success();
4251 },
4252 FirstChunkStart, CastedTripCount, NextChunkStride,
4253 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4254 "dispatch");
4255 if (!LoopResult) {
4256 // It is safe to assume this didn't return an error because the callback
4257 // passed into createCanonicalLoop is the only possible error source, and it
4258 // always returns success. Need to still cast the result into bool to avoid
4259 // runtime errors.
4260 llvm_unreachable("unexpected error creating canonical loop");
4261 }
4262 CanonicalLoopInfo *DispatchCLI = *LoopResult;
4263
4264 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4265 // not have to preserve the canonical invariant.
4266 BasicBlock *DispatchBody = DispatchCLI->getBody();
4267 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4268 BasicBlock *DispatchExit = DispatchCLI->getExit();
4269 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4270 DispatchCLI->invalidate();
4271
4272 // Rewire the original loop to become the chunk loop inside the dispatch loop.
// dispatch.after -> original after; original exit -> dispatch latch;
// dispatch body -> DispatchEnter (the block split off before the dispatch
// loop was created).
4273 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4274 redirectTo(CLI->getExit(), DispatchLatch, DL);
4275 redirectTo(DispatchBody, DispatchEnter, DL);
4276
4277 // Prepare the prolog of the chunk loop.
// NOTE(review): listing lines 4278-4279 and 4282 are missing from this dump;
// they presumably position the builder for the computations below.
4280
4281 // Compute the number of iterations of the chunk loop.
4283 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
// The last chunk may be partial: clamp to the iterations that remain.
4284 Value *IsLastChunk =
4285 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4286 Value *CountUntilOrigTripCount =
4287 Builder.CreateSub(CastedTripCount, DispatchCounter);
4288 Value *ChunkTripCount = Builder.CreateSelect(
4289 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4290 Value *BackcastedChunkTC =
4291 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4292 CLI->setTripCount(BackcastedChunkTC);
4293
4294 // Update all uses of the induction variable except the one in the condition
4295 // block that compares it with the actual upper bound, and the increment in
4296 // the latch block.
4297 Value *BackcastedDispatchCounter =
4298 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
// The replacement value is IV + dispatch counter (truncated to IVTy).
4299 CLI->mapIndVar([&](Instruction *) -> Value * {
4300 Builder.restoreIP(CLI->getBodyIP());
4301 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4302 });
4303
4304 // In the "exit" block, call the "fini" function.
4305 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4306 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4307
4308 // Add the barrier if requested.
4309 if (NeedsBarrier) {
4310 InsertPointOrErrorTy AfterIP =
4311 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4312 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4313 if (!AfterIP)
4314 return AfterIP.takeError();
4315 }
4316
4317#ifndef NDEBUG
4318 // Even though we currently do not support applying additional methods to it,
4319 // the chunk loop should remain a canonical loop.
4320 CLI->assertOK();
4321#endif
4322
4323 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4324}
4325
4326// Returns an LLVM function to call for executing an OpenMP static worksharing
4327// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4328// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4329static FunctionCallee
4331 WorksharingLoopType LoopType) {
4332 unsigned Bitwidth = Ty->getIntegerBitWidth();
4333 Module &M = OMPBuilder->M;
4334 switch (LoopType) {
4335 case WorksharingLoopType::ForStaticLoop:
4336 if (Bitwidth == 32)
4337 return OMPBuilder->getOrCreateRuntimeFunction(
4338 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4339 if (Bitwidth == 64)
4340 return OMPBuilder->getOrCreateRuntimeFunction(
4341 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4342 break;
4343 case WorksharingLoopType::DistributeStaticLoop:
4344 if (Bitwidth == 32)
4345 return OMPBuilder->getOrCreateRuntimeFunction(
4346 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4347 if (Bitwidth == 64)
4348 return OMPBuilder->getOrCreateRuntimeFunction(
4349 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4350 break;
4351 case WorksharingLoopType::DistributeForStaticLoop:
4352 if (Bitwidth == 32)
4353 return OMPBuilder->getOrCreateRuntimeFunction(
4354 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4355 if (Bitwidth == 64)
4356 return OMPBuilder->getOrCreateRuntimeFunction(
4357 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4358 break;
4359 }
4360 if (Bitwidth != 32 && Bitwidth != 64) {
4361 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4362 }
4363 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4364}
4365
4366// Inserts a call to proper OpenMP Device RTL function which handles
4367// loop worksharing.
4369 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4370 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4371 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4372 Type *TripCountTy = TripCount->getType();
4373 Module &M = OMPBuilder->M;
4374 IRBuilder<> &Builder = OMPBuilder->Builder;
4375 FunctionCallee RTLFn =
4376 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4377 SmallVector<Value *, 8> RealArgs;
4378 RealArgs.push_back(Ident);
4379 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4380 RealArgs.push_back(LoopBodyArg);
4381 RealArgs.push_back(TripCount);
4382 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4383 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4384 Builder.CreateCall(RTLFn, RealArgs);
4385 return;
4386 }
4387 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4388 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4389 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4390 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4391
4392 RealArgs.push_back(
4393 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4394 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4395 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4396 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4397 }
4398
4399 Builder.CreateCall(RTLFn, RealArgs);
4400}
4401
4402static void
4404 CanonicalLoopInfo *CLI, Value *Ident,
4405 Function &OutlinedFn, Type *ParallelTaskPtr,
4406 const SmallVector<Instruction *, 4> &ToBeDeleted,
4407 WorksharingLoopType LoopType) {
4408 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4409 BasicBlock *Preheader = CLI->getPreheader();
4410 Value *TripCount = CLI->getTripCount();
4411
4412 // After loop body outling, the loop body contains only set up
4413 // of loop body argument structure and the call to the outlined
4414 // loop body function. Firstly, we need to move setup of loop body args
4415 // into loop preheader.
4416 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4417 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4418
4419 // The next step is to remove the whole loop. We do not it need anymore.
4420 // That's why make an unconditional branch from loop preheader to loop
4421 // exit block
4422 Builder.restoreIP({Preheader, Preheader->end()});
4423 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4424 Preheader->getTerminator()->eraseFromParent();
4425 Builder.CreateBr(CLI->getExit());
4426
4427 // Delete dead loop blocks
4428 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4429 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4430 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4431 CleanUpInfo.EntryBB = CLI->getHeader();
4432 CleanUpInfo.ExitBB = CLI->getExit();
4433 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4434 DeleteDeadBlocks(BlocksToBeRemoved);
4435
4436 // Find the instruction which corresponds to loop body argument structure
4437 // and remove the call to loop body function instruction.
4438 Value *LoopBodyArg;
4439 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4440 assert(OutlinedFnUser &&
4441 "Expected unique undroppable user of outlined function");
4442 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4443 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4444 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4445 "Expected outlined function call to be located in loop preheader");
4446 // Check in case no argument structure has been passed.
4447 if (OutlinedFnCallInstruction->arg_size() > 1)
4448 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4449 else
4450 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4451 OutlinedFnCallInstruction->eraseFromParent();
4452
4453 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4454 LoopBodyArg, ParallelTaskPtr, TripCount,
4455 OutlinedFn);
4456
4457 for (auto &ToBeDeletedItem : ToBeDeleted)
4458 ToBeDeletedItem->eraseFromParent();
4459 CLI->invalidate();
4460}
4461
/// Prepares the canonical loop \p CLI for execution through the OpenMP device
/// runtime's worksharing-loop entry point selected by \p LoopType.
///
/// The loop body (from the body block up to a freshly split "omp.prelatch"
/// block) is registered for outlining. Inside that region the induction
/// variable is replaced by a stand-in load that is excluded from the aggregated
/// argument structure, so it becomes a separate argument of the outlined
/// function. The runtime call itself is emitted later, from the post-outline
/// callback (workshareLoopTargetCallback).
///
/// \param DL        Debug location used to create the source-location ident.
/// \param CLI       The canonical loop to transform.
/// \param AllocaIP  Its block is recorded as the outer alloca block.
/// \param LoopType  Worksharing-loop flavour forwarded to the callback.
/// \returns The insertion point after the loop, as reported by \p CLI.
///
/// NOTE(review): this listing appears to have dropped several source lines
/// (the return-type line, and the declarations of `ToBeDeleted`, `Blocks` and
/// `Users`); confirm against the original file.
4463OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4464 InsertPointTy AllocaIP,
4465 WorksharingLoopType LoopType) {
4466 uint32_t SrcLocStrSize;
4467 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4468 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4469
4470 OutlineInfo OI;
     // NOTE(review): this assignment is overwritten with AllocaIP's block a few
     // lines below.
4471 OI.OuterAllocaBB = CLI->getPreheader();
4472 Function *OuterFn = CLI->getPreheader()->getParent();
4473
4474 // Instructions which need to be deleted at the end of code generation
     // NOTE(review): the declaration of `ToBeDeleted` is missing from this
     // listing; the callback takes it as SmallVector<Instruction *, 4>.
4476
4477 OI.OuterAllocaBB = AllocaIP.getBlock();
4478
4479 // Mark the body loop as region which needs to be extracted
     // The latch is split at its first instruction so that the region gets a
     // dedicated exit block named "omp.prelatch".
4480 OI.EntryBB = CLI->getBody();
4481 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4482 "omp.prelatch", true);
4483
4484 // Prepare loop body for extraction
4485 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4486
4487 // Insert new loop counter variable which will be used only in loop
4488 // body.
4489 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4490 Instruction *NewLoopCntLoad =
4491 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4492 // New loop counter instructions are redundant in the loop preheader when
4493 // code generation for workshare loop is finished. That's why mark them as
4494 // ready for deletion.
     // The load is queued before the alloca so that the user is erased before
     // the value it uses.
4495 ToBeDeleted.push_back(NewLoopCntLoad);
4496 ToBeDeleted.push_back(NewLoopCnt);
4497
4498 // Analyse loop body region. Find all input variables which are used inside
4499 // loop body region.
4500 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
     // NOTE(review): the declaration of `Blocks` is missing from this listing.
4502 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
     // NOTE(review): BlocksT is not referenced again in this function.
4503 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4504 ParallelRegionBlockSet.end());
4505
     // This extractor is only used below to query allocas; the extraction
     // itself is not triggered here.
4506 CodeExtractorAnalysisCache CEAC(*OuterFn);
4507 CodeExtractor Extractor(Blocks,
4508 /* DominatorTree */ nullptr,
4509 /* AggregateArgs */ true,
4510 /* BlockFrequencyInfo */ nullptr,
4511 /* BranchProbabilityInfo */ nullptr,
4512 /* AssumptionCache */ nullptr,
4513 /* AllowVarArgs */ true,
4514 /* AllowAlloca */ true,
4515 /* AllocationBlock */ CLI->getPreheader(),
4516 /* Suffix */ ".omp_wsloop",
4517 /* AggrArgsIn0AddrSpace */ true);
4518
4519 BasicBlock *CommonExit = nullptr;
     // NOTE(review): Inputs and Outputs are declared but not used below.
4520 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4521
4522 // Find allocas outside the loop body region which are used inside loop
4523 // body
4524 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4525
4526 // We need to model loop body region as the function f(cnt, loop_arg).
4527 // That's why we replace loop induction variable by the new counter
4528 // which will be one of loop body function argument
     // NOTE(review): the first line of the `Users` declaration is missing from
     // this listing; it is evidently built from the induction variable's user
     // range so that the use list can be modified while looping.
4530 CLI->getIndVar()->user_end());
4531 for (auto Use : Users) {
4532 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
     // Only uses inside the region to be outlined are rewritten.
4533 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4534 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4535 }
4536 }
4537 }
4538 // Make sure that loop counter variable is not merged into loop body
4539 // function argument structure and it is passed as separate variable
4540 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4541
4542 // PostOutline CB is invoked when loop body function is outlined and
4543 // loop body is replaced by call to outlined function. We need to add
4544 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
4545 // function will handle loop control logic.
4546 //
     // The closure copies everything it needs and takes ownership of the
     // deletion list, since it runs after this function has returned.
     // NOTE(review): ParallelTaskPtr is not declared in this function;
     // presumably it is a member of OpenMPIRBuilder.
4547 OI.PostOutlineCB = [=, ToBeDeletedVec =
4548 std::move(ToBeDeleted)](Function &OutlinedFn) {
4549 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4550 ToBeDeletedVec, LoopType);
4551 };
4552 addOutlineInfo(std::move(OI));
4553 return CLI->getAfterIP();
4554}
4555
// NOTE(review): the first lines of this definition (return type, function name
// and leading parameters) are missing from this listing; the body relies on
// parameters named DL, CLI and AllocaIP being declared there.
//
// Chooses how a worksharing loop is lowered:
//   * on a target device, always via applyWorkshareLoopTarget;
//   * otherwise by the effective schedule type computed from the schedule
//     kind, chunk size and modifiers: static, static-chunked, or one of the
//     dynamically dispatched schedules.
// The result of the selected lowering function is returned unchanged.
4558 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4559 bool HasSimdModifier, bool HasMonotonicModifier,
4560 bool HasNonmonotonicModifier, bool HasOrderedClause,
4561 WorksharingLoopType LoopType) {
     // Device code ignores the schedule arguments entirely.
4562 if (Config.isTargetDevice())
4563 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4564 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4565 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4566 HasNonmonotonicModifier, HasOrderedClause);
4567
4568 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4569 OMPScheduleType::ModifierOrdered;
     // Dispatch on the base schedule only; modifier bits are masked out.
4570 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4571 case OMPScheduleType::BaseStatic:
4572 assert(!ChunkSize && "No chunk size with static-chunked schedule");
     // Ordered static loops go through the dynamic dispatch lowering.
4573 if (IsOrdered)
4574 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4575 NeedsBarrier, ChunkSize);
4576 // FIXME: Monotonicity ignored?
4577 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4578
4579 case OMPScheduleType::BaseStaticChunked:
     // Same rule as above: "ordered" forces the dynamic dispatch lowering.
4580 if (IsOrdered)
4581 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4582 NeedsBarrier, ChunkSize);
4583 // FIXME: Monotonicity ignored?
4584 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4585 ChunkSize);
4586
     // Schedules that must not carry a user-defined chunk size; after the
     // check they share the dynamic lowering with the chunked schedules.
4587 case OMPScheduleType::BaseRuntime:
4588 case OMPScheduleType::BaseAuto:
4589 case OMPScheduleType::BaseGreedy:
4590 case OMPScheduleType::BaseBalanced:
4591 case OMPScheduleType::BaseSteal:
4592 case OMPScheduleType::BaseGuidedSimd:
4593 case OMPScheduleType::BaseRuntimeSimd:
4594 assert(!ChunkSize &&
4595 "schedule type does not support user-defined chunk sizes");
4596 [[fallthrough]];
4597 case OMPScheduleType::BaseDynamicChunked:
4598 case OMPScheduleType::BaseGuidedChunked:
4599 case OMPScheduleType::BaseGuidedIterativeChunked:
4600 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4601 case OMPScheduleType::BaseStaticBalancedChunked:
4602 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4603 NeedsBarrier, ChunkSize);
4604
4605 default:
4606 llvm_unreachable("Unknown/unimplemented schedule kind");
4607 }
4608}
4609
4610/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4611/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4612/// the runtime. Always interpret integers as unsigned similarly to
4613/// CanonicalLoopInfo.
4614static FunctionCallee
4616 unsigned Bitwidth = Ty->getIntegerBitWidth();
4617 if (Bitwidth == 32)
4618 return OMPBuilder.getOrCreateRuntimeFunction(
4619 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4620 if (Bitwidth == 64)
4621 return OMPBuilder.getOrCreateRuntimeFunction(
4622 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4623 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4624}
4625
4626/// Returns an LLVM function to call for updating the next loop using OpenMP
4627/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4628/// the runtime. Always interpret integers as unsigned similarly to
4629/// CanonicalLoopInfo.
4630static FunctionCallee
4632 unsigned Bitwidth = Ty->getIntegerBitWidth();
4633 if (Bitwidth == 32)
4634 return OMPBuilder.getOrCreateRuntimeFunction(
4635 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4636 if (Bitwidth == 64)
4637 return OMPBuilder.getOrCreateRuntimeFunction(
4638 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4639 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4640}
4641
4642/// Returns an LLVM function to call for finalizing the dynamic loop using
4643/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4644/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4645static FunctionCallee
4647 unsigned Bitwidth = Ty->getIntegerBitWidth();
4648 if (Bitwidth == 32)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4651 if (Bitwidth == 64)
4652 return OMPBuilder.getOrCreateRuntimeFunction(
4653 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4654 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4655}
4656
/// Rewrites the canonical loop \p CLI so that its iterations are handed out by
/// the OpenMP runtime's dynamic dispatch functions.
///
/// An "init" call is emitted at the end of the preheader, and a new outer
/// condition block repeatedly calls "next" to obtain a [lower, upper] chunk.
/// The original loop becomes the inner loop that runs one chunk and then
/// branches back to the outer condition. When the schedule carries the ordered
/// modifier, a "fini" call is added to the latch; when \p NeedsBarrier is set,
/// a barrier is created in the exit block.
///
/// \param DL           Debug location for the source-location ident and the
///                     barrier.
/// \param CLI          The loop to transform; invalidated on success.
/// \param AllocaIP     Block in which the bound/stride/last-iteration slots
///                     are allocated.
/// \param SchedType    Schedule type constant passed to "init".
/// \param NeedsBarrier Whether to emit a barrier after the loop.
/// \param Chunk        Chunk size; a constant one is used when null.
/// \returns The insertion point after the loop (captured before the rewrite),
///          or the error produced while creating the barrier.
///
/// NOTE(review): this listing appears to have dropped several source lines
/// (the return-type line, the first line of the schedule-type assertion, the
/// statement following the "source location" comment, and the declaration of
/// `Comp`); confirm against the original file.
4658OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4659 InsertPointTy AllocaIP,
4660 OMPScheduleType SchedType,
4661 bool NeedsBarrier, Value *Chunk) {
4662 assert(CLI->isValid() && "Requires a valid canonical loop");
4663 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4664 "Require dedicated allocate IP");
     // NOTE(review): the opening line of the next assertion is missing here.
4666 "Require valid schedule type");
4667
4668 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4669 OMPScheduleType::ModifierOrdered;
4670
4671 // Set up the source location value for OpenMP runtime.
4673
4674 uint32_t SrcLocStrSize;
4675 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4676 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4677
4678 // Declare useful OpenMP runtime functions.
4679 Value *IV = CLI->getIndVar();
4680 Type *IVTy = IV->getType();
4681 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4682 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4683
4684 // Allocate space for computed loop bounds as expected by the "init" function.
4685 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4686 Type *I32Type = Type::getInt32Ty(M.getContext());
4687 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4688 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4689 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4690 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4691
4692 // At the end of the preheader, prepare for calling the "init" function by
4693 // storing the current loop bounds into the allocated space. A canonical loop
4694 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4695 // and produces an inclusive upper bound.
     // Hence the range given to the runtime is the 1-based inclusive interval
     // [1, tripcount]; the lower bound is shifted back by one when it is
     // loaded in the outer condition block below.
4696 BasicBlock *PreHeader = CLI->getPreheader();
4697 Builder.SetInsertPoint(PreHeader->getTerminator());
4698 Constant *One = ConstantInt::get(IVTy, 1);
4699 Builder.CreateStore(One, PLowerBound);
4700 Value *UpperBound = CLI->getTripCount();
4701 Builder.CreateStore(UpperBound, PUpperBound);
4702 Builder.CreateStore(One, PStride);
4703
     // Capture the loop's blocks and after-IP now, before the loop structure
     // is modified.
4704 BasicBlock *Header = CLI->getHeader();
4705 BasicBlock *Exit = CLI->getExit();
4706 BasicBlock *Cond = CLI->getCond();
4707 BasicBlock *Latch = CLI->getLatch();
4708 InsertPointTy AfterIP = CLI->getAfterIP();
4709
4710 // The CLI will be "broken" in the code below, as the loop is no longer
4711 // a valid canonical loop.
4712
     // Without an explicit chunk size, a chunk of one iteration is requested.
4713 if (!Chunk)
4714 Chunk = One;
4715
4716 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4717
4718 Constant *SchedulingType =
4719 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4720
4721 // Call the "init" function.
4722 Builder.CreateCall(DynamicInit,
4723 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4724 UpperBound, /* step */ One, Chunk});
4725
4726 // An outer loop around the existing one.
4727 BasicBlock *OuterCond = BasicBlock::Create(
4728 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4729 PreHeader->getParent());
4730 // This needs to be 32-bit always, so can't use the IVTy Zero above.
     // "next" fills the bound/stride slots; its result is compared against
     // zero below, with non-zero meaning there is another chunk to run.
4731 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4732 Value *Res =
4733 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4734 PLowerBound, PUpperBound, PStride});
4735 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4736 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
     // Convert the runtime's 1-based lower bound to the loop's 0-based IV.
4737 Value *LowerBound =
4738 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4739 Builder.CreateCondBr(MoreWork, Header, Exit);
4740
4741 // Change PHI-node in loop header to use outer cond rather than preheader,
4742 // and set IV to the LowerBound.
     // NOTE(review): this assumes the header's first instruction is the IV PHI
     // and that its incoming slot 0 is the preheader edge - confirm.
4743 Instruction *Phi = &Header->front();
4744 auto *PI = cast<PHINode>(Phi);
4745 PI->setIncomingBlock(0, OuterCond);
4746 PI->setIncomingValue(0, LowerBound);
4747
4748 // Then set the pre-header to jump to the OuterCond
4749 Instruction *Term = PreHeader->getTerminator();
4750 auto *Br = cast<BranchInst>(Term);
4751 Br->setSuccessor(0, OuterCond);
4752
4753 // Modify the inner condition:
4754 // * Use the UpperBound returned from the DynamicNext call.
4755 // * jump to the outer loop when done with one of the inner loops.
4756 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4757 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
     // NOTE(review): the declaration of `Comp` is missing from this listing;
     // presumably it is the instruction at the builder's insertion point, i.e.
     // the compare that followed the newly inserted load.
4759 auto *CI = cast<CmpInst>(Comp);
4760 CI->setOperand(1, UpperBound);
4761 // Redirect the inner exit to branch to outer condition.
4762 Instruction *Branch = &Cond->back();
4763 auto *BI = cast<BranchInst>(Branch);
4764 assert(BI->getSuccessor(1) == Exit);
4765 BI->setSuccessor(1, OuterCond);
4766
4767 // Call the "fini" function if "ordered" is present in wsloop directive.
     // The call is placed in front of the latch's last instruction.
4768 if (Ordered) {
4769 Builder.SetInsertPoint(&Latch->back());
4770 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4771 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4772 }
4773
4774 // Add the barrier if requested.
     // The barrier goes in front of the exit block's last instruction. If
     // creating it fails, the error is returned and CLI is not invalidated.
4775 if (NeedsBarrier) {
4776 Builder.SetInsertPoint(&Exit->back());
4777 InsertPointOrErrorTy BarrierIP =
4778 createBarrier(LocationDescription(Builder.saveIP(), DL),
4779 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4780 /* CheckCancelFlag */ false);
4781 if (!BarrierIP)
4782 return BarrierIP.takeError();
4783 }
4784
4785 CLI->invalidate();
4786 return AfterIP;
4787}
4788
4789/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4790/// after this \p OldTarget will be orphaned.
4792 BasicBlock *NewTarget, DebugLoc DL) {
4793 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4794 redirectTo(Pred, NewTarget, DL);
4795}
4796
4797/// Determine which blocks in \p BBs are reachable from outside and remove the
4798/// ones that are not reachable from the function.
4800 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4801 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4802 for (Use &U : BB->uses()) {
4803 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4804 if (!UseInst)
4805 continue;
4806 if (BBsToErase.count(UseInst->getParent()))
4807 continue;
4808 return true;
4809 }
4810 return false;
4811 };
4812
4813 while (BBsToErase.remove_if(HasRemainingUses)) {
4814 // Try again if anything was removed.
4815 }
4816
4817 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4818 DeleteDeadBlocks(BBVec);
4819}
4820
// NOTE(review): the first lines of this definition (return type, function name
// and leading parameters) are missing from this listing, as are a few interior
// lines (the loop header in front of `Loop->collectControlBlocks`, and the
// statement after the "Setup the IRBuilder" comment). The body relies on
// parameters named DL and Loops being declared there. Confirm against the
// original file.
//
// Collapses the perfectly ordered loop nest `Loops` (outermost first) into a
// single canonical loop whose trip count is the product of the input trip
// counts. The original induction variables are recomputed from the collapsed
// one with a div/mod scheme, the code between the loops is sunk into the new
// body, and the input loops are invalidated. With a single input loop, that
// loop is returned unchanged. The trip-count computation is emitted at
// ComputeIP if it is set, otherwise in the outermost loop's preheader.
4823 InsertPointTy ComputeIP) {
4824 assert(Loops.size() >= 1 && "At least one loop required");
4825 size_t NumLoops = Loops.size();
4826
4827 // Nothing to do if there is already just one loop.
4828 if (NumLoops == 1)
4829 return Loops.front();
4830
4831 CanonicalLoopInfo *Outermost = Loops.front();
4832 CanonicalLoopInfo *Innermost = Loops.back();
4833 BasicBlock *OrigPreheader = Outermost->getPreheader();
4834 BasicBlock *OrigAfter = Outermost->getAfter();
4835 Function *F = OrigPreheader->getParent();
4836
4837 // Loop control blocks that may become orphaned later.
4838 SmallVector<BasicBlock *, 12> OldControlBBs;
4839 OldControlBBs.reserve(6 * Loops.size());
     // NOTE(review): the loop header iterating over `Loops` is missing here.
4841 Loop->collectControlBlocks(OldControlBBs);
4842
4843 // Setup the IRBuilder for inserting the trip count computation.
4845 if (ComputeIP.isSet())
4846 Builder.restoreIP(ComputeIP);
4847 else
4848 Builder.restoreIP(Outermost->getPreheaderIP());
4849
4850 // Derive the collapsed loop's trip count.
4851 // TODO: Find common/largest indvar type.
4852 Value *CollapsedTripCount = nullptr;
4853 for (CanonicalLoopInfo *L : Loops) {
4854 assert(L->isValid() &&
4855 "All loops to collapse must be valid canonical loops");
4856 Value *OrigTripCount = L->getTripCount();
     // The first loop seeds the product.
4857 if (!CollapsedTripCount) {
4858 CollapsedTripCount = OrigTripCount;
4859 continue;
4860 }
4861
4862 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
     // The multiplication is flagged no-unsigned-wrap, i.e. the product of the
     // trip counts is assumed to fit into the induction variable type.
4863 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4864 {}, /*HasNUW=*/true);
4865 }
4866
4867 // Create the collapsed loop control flow.
4868 CanonicalLoopInfo *Result =
4869 createLoopSkeleton(DL, CollapsedTripCount, F,
4870 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4871
4872 // Build the collapsed loop body code.
4873 // Start with deriving the input loop induction variables from the collapsed
4874 // one, using a divmod scheme. To preserve the original loops' order, the
4875 // innermost loop uses the least significant bits.
4876 Builder.restoreIP(Result->getBodyIP());
4877
4878 Value *Leftover = Result->getIndVar();
4879 SmallVector<Value *> NewIndVars;
4880 NewIndVars.resize(NumLoops);
     // Walk from the innermost loop outwards, peeling off one "digit" of the
     // collapsed induction variable per loop; index 0 is handled afterwards.
4881 for (int i = NumLoops - 1; i >= 1; --i) {
4882 Value *OrigTripCount = Loops[i]->getTripCount();
4883
4884 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4885 NewIndVars[i] = NewIndVar;
4886
4887 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4888 }
4889 // Outermost loop gets all the remaining bits.
4890 NewIndVars[0] = Leftover;
4891
4892 // Construct the loop body control flow.
4893 // We progressively construct the branch structure following in direction of
4894 // the control flow, from the leading in-between code, the loop nest body, the
4895 // trailing in-between code, and rejoining the collapsed loop's latch.
4896 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
4897 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4898 // its predecessors as sources.
4899 BasicBlock *ContinueBlock = Result->getBody();
4900 BasicBlock *ContinuePred = nullptr;
     // ContinueWith(Dest, NextSrc): connect the current source(s) to Dest, then
     // remember NextSrc as the block whose predecessors form the next sources.
     // Only the very first call uses ContinueBlock; it is cleared afterwards.
4901 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4902 BasicBlock *NextSrc) {
4903 if (ContinueBlock)
4904 redirectTo(ContinueBlock, Dest, DL);
4905 else
4906 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4907
4908 ContinueBlock = nullptr;
4909 ContinuePred = NextSrc;
4910 };
4911
4912 // The code before the nested loop of each level.
4913 // Because we are sinking it into the nest, it will be executed more often
4914 // than the original loop. More sophisticated schemes could keep track of what
4915 // the in-between code is and instantiate it only once per thread.
4916 for (size_t i = 0; i < NumLoops - 1; ++i)
4917 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4918
4919 // Connect the loop nest body.
4920 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4921
4922 // The code after the nested loop at each level.
4923 for (size_t i = NumLoops - 1; i > 0; --i)
4924 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4925
4926 // Connect the finished loop to the collapsed loop latch.
4927 ContinueWith(Result->getLatch(), nullptr);
4928
4929 // Replace the input loops with the new collapsed loop.
4930 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4931 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4932
4933 // Replace the input loop indvars with the derived ones.
4934 for (size_t i = 0; i < NumLoops; ++i)
4935 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4936
4937 // Remove unused parts of the input loops.
4938 removeUnusedBlocksFromParent(OldControlBBs);
4939
     // The input loops no longer describe valid canonical loops.
4940 for (CanonicalLoopInfo *L : Loops)
4941 L->invalidate();
4942
4943#ifndef NDEBUG
4944 Result->assertOK();
4945#endif
4946 return Result;
4947}
4948
// Tiles the loop nest `Loops` (outermost first) with the per-loop `TileSizes`.
// For N input loops the result holds 2*N new canonical loops: first the N
// "floor" loops that iterate over tiles, then the N "tile" loops that iterate
// within a tile. The original induction variables are replaced by
// tilesize * floor_iv + tile_iv, the code between the original loop headers is
// sunk into the innermost generated body, and the input loops are invalidated.
//
// NOTE(review): this listing appears to have dropped several source lines:
// the line carrying the function name and leading parameters, the loop header
// in front of `Loop->collectControlBlocks`, the declaration of
// `InbetweenCode`, the statements after the "Compute the trip counts" and
// "Within the innermost floor loop" comments, and one statement after the
// final redirect (InnerLatch is otherwise unused). Confirm against the
// original file.
4949std::vector<CanonicalLoopInfo *>
4951 ArrayRef<Value *> TileSizes) {
4952 assert(TileSizes.size() == Loops.size() &&
4953 "Must pass as many tile sizes as there are loops");
4954 int NumLoops = Loops.size();
4955 assert(NumLoops >= 1 && "At least one loop to tile required");
4956
4957 CanonicalLoopInfo *OutermostLoop = Loops.front();
4958 CanonicalLoopInfo *InnermostLoop = Loops.back();
4959 Function *F = OutermostLoop->getBody()->getParent();
4960 BasicBlock *InnerEnter = InnermostLoop->getBody();
4961 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4962
4963 // Loop control blocks that may become orphaned later.
4964 SmallVector<BasicBlock *, 12> OldControlBBs;
4965 OldControlBBs.reserve(6 * Loops.size());
     // NOTE(review): the loop header iterating over `Loops` is missing here.
4967 Loop->collectControlBlocks(OldControlBBs);
4968
4969 // Collect original trip counts and induction variable to be accessible by
4970 // index. Also, the structure of the original loops is not preserved during
4971 // the construction of the tiled loops, so do it before we scavenge the BBs of
4972 // any original CanonicalLoopInfo.
4973 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4974 for (CanonicalLoopInfo *L : Loops) {
4975 assert(L->isValid() && "All input loops must be valid canonical loops");
4976 OrigTripCounts.push_back(L->getTripCount());
4977 OrigIndVars.push_back(L->getIndVar());
4978 }
4979
4980 // Collect the code between loop headers. These may contain SSA definitions
4981 // that are used in the loop nest body. To be usable within the innermost
4982 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4983 // these instructions may be executed more often than before the tiling.
4984 // TODO: It would be sufficient to only sink them into body of the
4985 // corresponding tile loop.
     // Each entry is (body of the surrounding loop, header of the nested loop).
4987 for (int i = 0; i < NumLoops - 1; ++i) {
4988 CanonicalLoopInfo *Surrounding = Loops[i];
4989 CanonicalLoopInfo *Nested = Loops[i + 1];
4990
4991 BasicBlock *EnterBB = Surrounding->getBody();
4992 BasicBlock *ExitBB = Nested->getHeader();
4993 InbetweenCode.emplace_back(EnterBB, ExitBB);
4994 }
4995
4996 // Compute the trip counts of the floor loops.
4998 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4999 SmallVector<Value *, 4> FloorCount, FloorRems;
5000 for (int i = 0; i < NumLoops; ++i) {
5001 Value *TileSize = TileSizes[i];
5002 Value *OrigTripCount = OrigTripCounts[i];
5003 Type *IVType = OrigTripCount->getType();
5004
5005 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5006 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5007
5008 // 0 if the tilesize divides the tripcount, 1 otherwise.
5009 // 1 means we need an additional iteration for a partial tile.
5010 //
5011 // Unfortunately we cannot just use the roundup-formula
5012 // (tripcount + tilesize - 1)/tilesize
5013 // because the summation might overflow. We do not want to introduce undefined
5014 // behavior when the untiled loop nest did not.
5015 Value *FloorTripOverflow =
5016 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5017
5018 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
     // From here on FloorTripCount is the rounded-up number of tiles, i.e. it
     // includes the partial tile if there is one.
5019 FloorTripCount =
5020 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5021 "omp_floor" + Twine(i) + ".tripcount", true);
5022
5023 // Remember some values for later use.
5024 FloorCount.push_back(FloorTripCount);
5025 FloorRems.push_back(FloorTripRem);
5026 }
5027
5028 // Generate the new loop nest, from the outermost to the innermost.
5029 std::vector<CanonicalLoopInfo *> Result;
5030 Result.reserve(NumLoops * 2);
5031
5032 // The basic block of the surrounding loop that enters the nest generated
5033 // loop.
5034 BasicBlock *Enter = OutermostLoop->getPreheader();
5035
5036 // The basic block of the surrounding loop where the inner code should
5037 // continue.
5038 BasicBlock *Continue = OutermostLoop->getAfter();
5039
5040 // Where the next loop basic block should be inserted.
5041 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5042
     // Creates one new loop with the given trip count, nests it between the
     // current Enter and Continue blocks, and then moves Enter/Continue inwards
     // so that the next loop created is nested inside this one.
5043 auto EmbeddNewLoop =
5044 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5045 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5046 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5047 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5048 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5049 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5050
5051 // Setup the position where the next embedded loop connects to this loop.
5052 Enter = EmbeddedLoop->getBody();
5053 Continue = EmbeddedLoop->getLatch();
5054 OutroInsertBefore = EmbeddedLoop->getLatch();
5055 return EmbeddedLoop;
5056 };
5057
     // Creates one nested loop per trip count, named NameBase followed by the
     // index, and appends each of them to Result.
5058 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5059 const Twine &NameBase) {
5060 for (auto P : enumerate(TripCounts)) {
5061 CanonicalLoopInfo *EmbeddedLoop =
5062 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5063 Result.push_back(EmbeddedLoop);
5064 }
5065 };
5066
5067 EmbeddNewLoops(FloorCount, "floor");
5068
5069 // Within the innermost floor loop, emit the code that computes the tile
5070 // sizes.
     // NOTE(review): the statement positioning the builder is missing from
     // this listing.
5072 SmallVector<Value *, 4> TileCounts;
5073 for (int i = 0; i < NumLoops; ++i) {
5074 CanonicalLoopInfo *FloorLoop = Result[i];
5075 Value *TileSize = TileSizes[i];
5076
     // NOTE(review): possible bug. FloorCount[i] is the rounded-up tile count,
     // and the floor induction variable presumably ranges over
     // [0, FloorCount[i]), so this equality looks like it can never hold and
     // the partial tile would run a full TileSize iterations. The intent is
     // presumably to compare against the number of complete tiles
     // (tripcount / tilesize) - confirm and fix.
5077 Value *FloorIsEpilogue =
5078 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5079 Value *TileTripCount =
5080 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5081
5082 TileCounts.push_back(TileTripCount);
5083 }
5084
5085 // Create the tile loops.
5086 EmbeddNewLoops(TileCounts, "tile");
5087
5088 // Insert the inbetween code into the body.
     // Same chaining scheme as for the loops: the first region is entered from
     // the innermost generated body (BodyEnter); each later one is entered from
     // the predecessors of the previous region's exit block (BodyEntered).
5089 BasicBlock *BodyEnter = Enter;
5090 BasicBlock *BodyEntered = nullptr;
5091 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5092 BasicBlock *EnterBB = P.first;
5093 BasicBlock *ExitBB = P.second;
5094
5095 if (BodyEnter)
5096 redirectTo(BodyEnter, EnterBB, DL);
5097 else
5098 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5099
5100 BodyEnter = nullptr;
5101 BodyEntered = ExitBB;
5102 }
5103
5104 // Append the original loop nest body into the generated loop nest body.
5105 if (BodyEnter)
5106 redirectTo(BodyEnter, InnerEnter, DL);
5107 else
5108 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
     // NOTE(review): a statement appears to be missing here; presumably it
     // reconnects the original innermost latch (InnerLatch) to `Continue`.
5110
5111 // Replace the original induction variable with an induction variable computed
5112 // from the tile and floor induction variables.
5113 Builder.restoreIP(Result.back()->getBodyIP());
5114 for (int i = 0; i < NumLoops; ++i) {
5115 CanonicalLoopInfo *FloorLoop = Result[i];
5116 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5117 Value *OrigIndVar = OrigIndVars[i];
5118 Value *Size = TileSizes[i];
5119
     // orig_iv = tilesize * floor_iv + tile_iv; both operations are flagged
     // no-unsigned-wrap.
5120 Value *Scale =
5121 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5122 Value *Shift =
5123 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5124 OrigIndVar->replaceAllUsesWith(Shift);
5125 }
5126
5127 // Remove unused parts of the original loops.
5128 removeUnusedBlocksFromParent(OldControlBBs);
5129
     // The input loops no longer describe valid canonical loops.
5130 for (CanonicalLoopInfo *L : Loops)
5131 L->invalidate();
5132
5133#ifndef NDEBUG
5134 for (CanonicalLoopInfo *GenL : Result)
5135 GenL->assertOK();
5136#endif
5137 return Result;
5138}
5139
5140/// Attach metadata \p Properties to the basic block described by \p BB. If the
5141/// basic block already has metadata, the basic block properties are appended.
5143 ArrayRef<Metadata *> Properties) {
5144 // Nothing to do if no property to attach.
5145 if (Properties.empty())
5146 return;
5147
5148 LLVMContext &Ctx = BB->getContext();
5149 SmallVector<Metadata *> NewProperties;
5150 NewProperties.push_back(nullptr);
5151
5152 // If the basic block already has metadata, prepend it to the new metadata.
5153 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5154 if (Existing)
5155 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5156
5157 append_range(NewProperties, Properties);
5158 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5159 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5160
5161 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5162}
5163
5164/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5165/// loop already has metadata, the loop properties are appended.
5167 ArrayRef<Metadata *> Properties) {
5168 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5169
5170 // Attach metadata to the loop's latch
5171 BasicBlock *Latch = Loop->getLatch();
5172 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5173 addBasicBlockMetadata(Latch, Properties);
5174}
5175
5176/// Attach llvm.access.group metadata to the memref instructions of \p Block
5177static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5178 LoopInfo &LI) {
5179 for (Instruction &I : *Block) {
5180 if (I.mayReadOrWriteMemory()) {
5181 // TODO: This instruction may already have access group from
5182 // other pragmas e.g. #pragma clang loop vectorize. Append
5183 // so that the existing metadata is not overwritten.
5184 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5185 }
5186 }
5187}
5188
5192 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5193 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5194}
5195
5199 Loop, {
5200 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5201 });
5202}
5203
5204void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5205 Value *IfCond, ValueToValueMapTy &VMap,
5206 const Twine &NamePrefix) {
5207 Function *F = CanonicalLoop->getFunction();
5208
5209 // Define where if branch should be inserted
5210 Instruction *SplitBefore;
5211 if (Instruction::classof(IfCond)) {
5212 SplitBefore = dyn_cast<Instruction>(IfCond);
5213 } else {
5214 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5215 }
5216
5217 // TODO: We should not rely on pass manager. Currently we use pass manager
5218 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5219 // object. We should have a method which returns all blocks between
5220 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5222 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5223 FAM.registerPass([]() { return LoopAnalysis(); });
5224 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5225
5226 // Get the loop which needs to be cloned
5227 LoopAnalysis LIA;
5228 LoopInfo &&LI = LIA.run(*F, FAM);
5229 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5230
5231 // Create additional blocks for the if statement
5232 BasicBlock *Head = SplitBefore->getParent();
5233 Instruction *HeadOldTerm = Head->getTerminator();
5234 llvm::LLVMContext &C = Head->getContext();
5236 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5238 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5239
5240 // Create if condition branch.
5241 Builder.SetInsertPoint(HeadOldTerm);
5242 Instruction *BrInstr =
5243 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5244 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5245 // Then block contains branch to omp loop which needs to be vectorized
5246 spliceBB(IP, ThenBlock, false);
5247 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5248
5249 Builder.SetInsertPoint(ElseBlock);
5250
5251 // Clone loop for the else branch
5253
5254 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5255 for (BasicBlock *Block : L->getBlocks()) {
5256 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5257 NewBB->moveBefore(CanonicalLoop->getExit());
5258 VMap[Block] = NewBB;
5259 NewBlocks.push_back(NewBB);
5260 }
5261 remapInstructionsInBlocks(NewBlocks, VMap);
5262 Builder.CreateBr(NewBlocks.front());
5263}
5264
5265unsigned
5267 const StringMap<bool> &Features) {
5268 if (TargetTriple.isX86()) {
5269 if (Features.lookup("avx512f"))
5270 return 512;
5271 else if (Features.lookup("avx"))
5272 return 256;
5273 return 128;
5274 }
5275 if (TargetTriple.isPPC())
5276 return 128;
5277 if (TargetTriple.isWasm())
5278 return 128;
5279 return 0;
5280}
5281
5283 MapVector<Value *, Value *> AlignedVars,
5284 Value *IfCond, OrderKind Order,
5285 ConstantInt *Simdlen, ConstantInt *Safelen) {
5287
5288 Function *F = CanonicalLoop->getFunction();
5289
5290 // TODO: We should not rely on pass manager. Currently we use pass manager
5291 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5292 // object. We should have a method which returns all blocks between
5293 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5295 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5296 FAM.registerPass([]() { return LoopAnalysis(); });
5297 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5298
5299 LoopAnalysis LIA;
5300 LoopInfo &&LI = LIA.run(*F, FAM);
5301
5302 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5303 if (AlignedVars.size()) {
5305 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
5306 for (auto &AlignedItem : AlignedVars) {
5307 Value *AlignedPtr = AlignedItem.first;
5308 Value *Alignment = AlignedItem.second;
5309 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5310 AlignedPtr, Alignment);
5311 }
5312 Builder.restoreIP(IP);
5313 }
5314
5315 if (IfCond) {
5316 ValueToValueMapTy VMap;
5317 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5318 // Add metadata to the cloned loop which disables vectorization
5319 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5320 assert(MappedLatch &&
5321 "Cannot find value which corresponds to original loop latch");
5322 assert(isa<BasicBlock>(MappedLatch) &&
5323 "Cannot cast mapped latch block value to BasicBlock");
5324 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5325 ConstantAsMetadata *BoolConst =
5328 NewLatchBlock,
5329 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5330 BoolConst})});
5331 }
5332
5333 SmallSet<BasicBlock *, 8> Reachable;
5334
5335 // Get the basic blocks from the loop in which memref instructions
5336 // can be found.
5337 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5338 // preferably without running any passes.
5339 for (BasicBlock *Block : L->getBlocks()) {
5340 if (Block == CanonicalLoop->getCond() ||
5341 Block == CanonicalLoop->getHeader())
5342 continue;
5343 Reachable.insert(Block);
5344 }
5345
5346 SmallVector<Metadata *> LoopMDList;
5347
5348 // In presence of finite 'safelen', it may be unsafe to mark all
5349 // the memory instructions parallel, because loop-carried
5350 // dependences of 'safelen' iterations are possible.
5351 // If clause order(concurrent) is specified then the memory instructions
5352 // are marked parallel even if 'safelen' is finite.
5353 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5354 // Add access group metadata to memory-access instructions.
5355 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5356 for (BasicBlock *BB : Reachable)
5357 addSimdMetadata(BB, AccessGroup, LI);
5358 // TODO: If the loop has existing parallel access metadata, have
5359 // to combine two lists.
5360 LoopMDList.push_back(MDNode::get(
5361 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5362 }
5363
5364 // Use the above access group metadata to create loop level
5365 // metadata, which should be distinct for each loop.
5366 ConstantAsMetadata *BoolConst =
5368 LoopMDList.push_back(MDNode::get(
5369 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5370
5371 if (Simdlen || Safelen) {
5372 // If both simdlen and safelen clauses are specified, the value of the
5373 // simdlen parameter must be less than or equal to the value of the safelen
5374 // parameter. Therefore, use safelen only in the absence of simdlen.
5375 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5376 LoopMDList.push_back(
5377 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5378 ConstantAsMetadata::get(VectorizeWidth)}));
5379 }
5380
5381 addLoopMetadata(CanonicalLoop, LoopMDList);
5382}
5383
5384/// Create the TargetMachine object to query the backend for optimization
5385/// preferences.
5386///
5387/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5388/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5389/// needed for the LLVM pass pipeline. We use some default options to avoid
5390/// having to pass too many settings from the frontend that probably do not
5391/// matter.
5392///
5393/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5394/// method. If we are going to use TargetMachine for more purposes, especially
5395/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5396/// might become worth requiring front-ends to pass on their TargetMachine,
5397/// or at least cache it between methods. Note that while frontends such as Clang
5398/// have just a single main TargetMachine per translation unit, "target-cpu" and
5399/// "target-features" that determine the TargetMachine are per-function and can
5400/// be overridden using __attribute__((target("OPTIONS"))).
5401static std::unique_ptr<TargetMachine>
5403 Module *M = F->getParent();
5404
5405 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5406 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5407 const std::string &Triple = M->getTargetTriple();
5408
5409 std::string Error;
5411 if (!TheTarget)
5412 return {};
5413
5415 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5416 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5417 /*CodeModel=*/std::nullopt, OptLevel));
5418}
5419
5420/// Heuristically determine the best-performant unroll factor for \p CLI. This
5421/// depends on the target processor. We are re-using the same heuristics as the
5422/// LoopUnrollPass.
5424 Function *F = CLI->getFunction();
5425
5426 // Assume the user requests the most aggressive unrolling, even if the rest of
5427 // the code is optimized using a lower setting.
5429 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5430
5432 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5433 FAM.registerPass([]() { return AssumptionAnalysis(); });
5434 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5435 FAM.registerPass([]() { return LoopAnalysis(); });
5436 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5437 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5438 TargetIRAnalysis TIRA;
5439 if (TM)
5440 TIRA = TargetIRAnalysis(
5441 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5442 FAM.registerPass([&]() { return TIRA; });
5443
5444 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5446 ScalarEvolution &&SE = SEA.run(*F, FAM);
5448 DominatorTree &&DT = DTA.run(*F, FAM);
5449 LoopAnalysis LIA;
5450 LoopInfo &&LI = LIA.run(*F, FAM);
5452 AssumptionCache &&AC = ACT.run(*F, FAM);
5454
5455 Loop *L = LI.getLoopFor(CLI->getHeader());
5456 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5457
5460 /*BlockFrequencyInfo=*/nullptr,
5461 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5462 /*UserThreshold=*/std::nullopt,
5463 /*UserCount=*/std::nullopt,
5464 /*UserAllowPartial=*/true,
5465 /*UserAllowRuntime=*/true,
5466 /*UserUpperBound=*/std::nullopt,
5467 /*UserFullUnrollMaxCount=*/std::nullopt);
5468
5469 UP.Force = true;
5470
5471 // Account for additional optimizations taking place before the LoopUnrollPass
5472 // would unroll the loop.
5475
5476 // Use normal unroll factors even if the rest of the code is optimized for
5477 // size.
5480
5481 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5482 << " Threshold=" << UP.Threshold << "\n"
5483 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5484 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5485 << " PartialOptSizeThreshold="
5486 << UP.PartialOptSizeThreshold << "\n");
5487
5488 // Disable peeling.
5491 /*UserAllowPeeling=*/false,
5492 /*UserAllowProfileBasedPeeling=*/false,
5493 /*UnrollingSpecficValues=*/false);
5494
5496 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5497
5498 // Assume that reads and writes to stack variables can be eliminated by
5499 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5500 // size.
5501 for (BasicBlock *BB : L->blocks()) {
5502 for (Instruction &I : *BB) {
5503 Value *Ptr;
5504 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5505 Ptr = Load->getPointerOperand();
5506 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5507 Ptr = Store->getPointerOperand();
5508 } else
5509 continue;
5510
5511 Ptr = Ptr->stripPointerCasts();
5512
5513 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5514 if (Alloca->getParent() == &F->getEntryBlock())
5515 EphValues.insert(&I);
5516 }
5517 }
5518 }
5519
5520 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5521
5522 // Loop is not unrollable if the loop contains certain instructions.
5523 if (!UCE.canUnroll()) {
5524 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5525 return 1;
5526 }
5527
5528 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5529 << "\n");
5530
5531 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5532 // be able to use it.
5533 int TripCount = 0;
5534 int MaxTripCount = 0;
5535 bool MaxOrZero = false;
5536 unsigned TripMultiple = 0;
5537
5538 bool UseUpperBound = false;
5539 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5540 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5541 UseUpperBound);
5542 unsigned Factor = UP.Count;
5543 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5544
5545 // This function returns 1 to signal to not unroll a loop.
5546 if (Factor == 0)
5547 return 1;
5548 return Factor;
5549}
5550
5552 int32_t Factor,
5553 CanonicalLoopInfo **UnrolledCLI) {
5554 assert(Factor >= 0 && "Unroll factor must not be negative");
5555
5556 Function *F = Loop->getFunction();
5557 LLVMContext &Ctx = F->getContext();
5558
5559 // If the unrolled loop is not used for another loop-associated directive, it
5560 // is sufficient to add metadata for the LoopUnrollPass.
5561 if (!UnrolledCLI) {
5562 SmallVector<Metadata *, 2> LoopMetadata;
5563 LoopMetadata.push_back(
5564 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5565
5566 if (Factor >= 1) {
5568 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5569 LoopMetadata.push_back(MDNode::get(
5570 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5571 }
5572
5573 addLoopMetadata(Loop, LoopMetadata);
5574 return;
5575 }
5576
5577 // Heuristically determine the unroll factor.
5578 if (Factor == 0)
5580
5581 // No change required with unroll factor 1.
5582 if (Factor == 1) {
5583 *UnrolledCLI = Loop;
5584 return;
5585 }
5586
5587 assert(Factor >= 2 &&
5588 "unrolling only makes sense with a factor of 2 or larger");
5589
5590 Type *IndVarTy = Loop->getIndVarType();
5591
5592 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5593 // unroll the inner loop.
5594 Value *FactorVal =
5595 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5596 /*isSigned=*/false));
5597 std::vector<CanonicalLoopInfo *> LoopNest =
5598 tileLoops(DL, {Loop}, {FactorVal});
5599 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5600 *UnrolledCLI = LoopNest[0];
5601 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5602
5603 // LoopUnrollPass can only fully unroll loops with constant trip count.
5604 // Unroll by the unroll factor with a fallback epilog for the remainder
5605 // iterations if necessary.
5607 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5609 InnerLoop,
5610 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5612 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5613
5614#ifndef NDEBUG
5615 (*UnrolledCLI)->assertOK();
5616#endif
5617}
5618
5621 llvm::Value *BufSize, llvm::Value *CpyBuf,
5622 llvm::Value *CpyFn, llvm::Value *DidIt) {
5623 if (!updateToLocation(Loc))
5624 return Loc.IP;
5625
5626 uint32_t SrcLocStrSize;
5627 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5628 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5629 Value *ThreadId = getOrCreateThreadID(Ident);
5630
5631 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5632
5633 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5634
5635 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5636 Builder.CreateCall(Fn, Args);
5637
5638 return Builder.saveIP();
5639}
5640
5642 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5643 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5645
5646 if (!updateToLocation(Loc))
5647 return Loc.IP;
5648
5649 // If needed allocate and initialize `DidIt` with 0.
5650 // DidIt: flag variable: 1=single thread; 0=not single thread.
5651 llvm::Value *DidIt = nullptr;
5652 if (!CPVars.empty()) {
5655 }
5656
5657 Directive OMPD = Directive::OMPD_single;
5658 uint32_t SrcLocStrSize;
5659 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5660 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5661 Value *ThreadId = getOrCreateThreadID(Ident);
5662 Value *Args[] = {Ident, ThreadId};
5663
5664 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5665 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5666
5667 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5668 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5669
5670 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5671 if (Error Err = FiniCB(IP))
5672 return Err;
5673
5674 // The thread that executes the single region must set `DidIt` to 1.
5675 // This is used by __kmpc_copyprivate, to know if the caller is the
5676 // single thread or not.
5677 if (DidIt)
5679
5680 return Error::success();
5681 };
5682
5683 // generates the following:
5684 // if (__kmpc_single()) {
5685 // .... single region ...
5686 // __kmpc_end_single
5687 // }
5688 // __kmpc_copyprivate
5689 // __kmpc_barrier
5690
5691 InsertPointOrErrorTy AfterIP =
5692 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5693 /*Conditional*/ true,
5694 /*hasFinalize*/ true);
5695 if (!AfterIP)
5696 return AfterIP.takeError();
5697
5698 if (DidIt) {
5699 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5700 // NOTE BufSize is currently unused, so just pass 0.
5702 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5703 CPFuncs[I], DidIt);
5704 // NOTE __kmpc_copyprivate already inserts a barrier
5705 } else if (!IsNowait) {
5706 InsertPointOrErrorTy AfterIP =
5708 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5709 /* CheckCancelFlag */ false);
5710 if (!AfterIP)
5711 return AfterIP.takeError();
5712 }
5713 return Builder.saveIP();
5714}
5715
5717 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5718 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5719
5720 if (!updateToLocation(Loc))
5721 return Loc.IP;
5722
5723 Directive OMPD = Directive::OMPD_critical;
5724 uint32_t SrcLocStrSize;
5725 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5726 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5727 Value *ThreadId = getOrCreateThreadID(Ident);
5728 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5729 Value *Args[] = {Ident, ThreadId, LockVar};
5730
5731 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5732 Function *RTFn = nullptr;
5733 if (HintInst) {
5734 // Add Hint to entry Args and create call
5735 EnterArgs.push_back(HintInst);
5736 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5737 } else {
5738 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5739 }
5740 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5741
5742 Function *ExitRTLFn =
5743 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5744 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5745
5746 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5747 /*Conditional*/ false, /*hasFinalize*/ true);
5748}
5749
5752 InsertPointTy AllocaIP, unsigned NumLoops,
5753 ArrayRef<llvm::Value *> StoreValues,
5754 const Twine &Name, bool IsDependSource) {
5755 assert(
5756 llvm::all_of(StoreValues,
5757 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5758 "OpenMP runtime requires depend vec with i64 type");
5759
5760 if (!updateToLocation(Loc))
5761 return Loc.IP;
5762
5763 // Allocate space for vector and generate alloc instruction.
5764 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5765 Builder.restoreIP(AllocaIP);
5766 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5767 ArgsBase->setAlignment(Align(8));
5768 Builder.restoreIP(Loc.IP);
5769
5770 // Store the index value with offset in depend vector.
5771 for (unsigned I = 0; I < NumLoops; ++I) {
5772 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5773 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5774 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5775 STInst->setAlignment(Align(8));
5776 }
5777
5778 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5779 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5780
5781 uint32_t SrcLocStrSize;
5782 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5783 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5784 Value *ThreadId = getOrCreateThreadID(Ident);
5785 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5786
5787 Function *RTLFn = nullptr;
5788 if (IsDependSource)
5789 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5790 else
5791 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5792 Builder.CreateCall(RTLFn, Args);
5793
5794 return Builder.saveIP();
5795}
5796
5798 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5799 FinalizeCallbackTy FiniCB, bool IsThreads) {
5800 if (!updateToLocation(Loc))
5801 return Loc.IP;
5802
5803 Directive OMPD = Directive::OMPD_ordered;
5804 Instruction *EntryCall = nullptr;
5805 Instruction *ExitCall = nullptr;
5806
5807 if (IsThreads) {
5808 uint32_t SrcLocStrSize;
5809 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5810 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5811 Value *ThreadId = getOrCreateThreadID(Ident);
5812 Value *Args[] = {Ident, ThreadId};
5813
5814 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5815 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5816
5817 Function *ExitRTLFn =
5818 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5819 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5820 }
5821
5822 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5823 /*Conditional*/ false, /*hasFinalize*/ true);
5824}
5825
5826OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5827 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5828 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5829 bool HasFinalize, bool IsCancellable) {
5830
5831 if (HasFinalize)
5832 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5833
5834 // Create inlined region's entry and body blocks, in preparation
5835 // for conditional creation
5836 BasicBlock *EntryBB = Builder.GetInsertBlock();
5837 Instruction *SplitPos = EntryBB->getTerminator();
5838 if (!isa_and_nonnull<BranchInst>(SplitPos))
5839 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5840 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5841 BasicBlock *FiniBB =
5842 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5843
5845 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5846
5847 // generate body
5848 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5849 /* CodeGenIP */ Builder.saveIP()))
5850 return Err;
5851
5852 // emit exit call and do any needed finalization.
5853 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5854 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5855 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5856 "Unexpected control flow graph state!!");
5857 InsertPointOrErrorTy AfterIP =
5858 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5859 if (!AfterIP)
5860 return AfterIP.takeError();
5861 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5862 "Unexpected Control Flow State!");
5864
5865 // If we are skipping the region of a non conditional, remove the exit
5866 // block, and clear the builder's insertion point.
5867 assert(SplitPos->getParent() == ExitBB &&
5868 "Unexpected Insertion point location!");
5869 auto merged = MergeBlockIntoPredecessor(ExitBB);
5870 BasicBlock *ExitPredBB = SplitPos->getParent();
5871 auto InsertBB = merged ? ExitPredBB : ExitBB;
5872 if (!isa_and_nonnull<BranchInst>(SplitPos))
5873 SplitPos->eraseFromParent();
5874 Builder.SetInsertPoint(InsertBB);
5875
5876 return Builder.saveIP();
5877}
5878
5879OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5880 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5881 // if nothing to do, Return current insertion point.
5882 if (!Conditional || !EntryCall)
5883 return Builder.saveIP();
5884
5885 BasicBlock *EntryBB = Builder.GetInsertBlock();
5886 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5887 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5888 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5889
5890 // Emit thenBB and set the Builder's insertion point there for
5891 // body generation next. Place the block after the current block.
5892 Function *CurFn = EntryBB->getParent();
5893 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5894
5895 // Move Entry branch to end of ThenBB, and replace with conditional
5896 // branch (If-stmt)
5897 Instruction *EntryBBTI = EntryBB->getTerminator();
5898 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5899 EntryBBTI->removeFromParent();
5901 Builder.Insert(EntryBBTI);
5902 UI->eraseFromParent();
5904
5905 // return an insertion point to ExitBB.
5906 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5907}
5908
5909OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5910 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5911 bool HasFinalize) {
5912
5913 Builder.restoreIP(FinIP);
5914
5915 // If there is finalization to do, emit it before the exit call
5916 if (HasFinalize) {
5917 assert(!FinalizationStack.empty() &&
5918 "Unexpected finalization stack state!");
5919
5920 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5921 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5922
5923 if (Error Err = Fi.FiniCB(FinIP))
5924 return Err;
5925
5926 BasicBlock *FiniBB = FinIP.getBlock();
5927 Instruction *FiniBBTI = FiniBB->getTerminator();
5928
5929 // set Builder IP for call creation
5930 Builder.SetInsertPoint(FiniBBTI);
5931 }
5932
5933 if (!ExitCall)
5934 return Builder.saveIP();
5935
5936 // place the Exitcall as last instruction before Finalization block terminator
5937 ExitCall->removeFromParent();
5938 Builder.Insert(ExitCall);
5939
5940 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5941 ExitCall->getIterator());
5942}
5943
5945 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5946 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5947 if (!IP.isSet())
5948 return IP;
5949
5951
5952 // creates the following CFG structure
5953 // OMP_Entry : (MasterAddr != PrivateAddr)?
5954 // F T
5955 // | \
5956 // | copyin.not.master
5957 // | /
5958 // v /
5959 // copyin.not.master.end
5960 // |
5961 // v
5962 // OMP.Entry.Next
5963
5964 BasicBlock *OMP_Entry = IP.getBlock();
5965 Function *CurFn = OMP_Entry->getParent();
5966 BasicBlock *CopyBegin =
5967 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5968 BasicBlock *CopyEnd = nullptr;
5969
5970 // If entry block is terminated, split to preserve the branch to following
5971 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5972 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5973 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5974 "copyin.not.master.end");
5975 OMP_Entry->getTerminator()->eraseFromParent();
5976 } else {
5977 CopyEnd =
5978 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5979 }
5980
5981 Builder.SetInsertPoint(OMP_Entry);
5982 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5983 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5984 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5985 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5986
5987 Builder.SetInsertPoint(CopyBegin);
5988 if (BranchtoEnd)
5990
5991 return Builder.saveIP();
5992}
5993
5995 Value *Size, Value *Allocator,
5996 std::string Name) {
5998 updateToLocation(Loc);
5999
6000 uint32_t SrcLocStrSize;
6001 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6002 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6003 Value *ThreadId = getOrCreateThreadID(Ident);
6004 Value *Args[] = {ThreadId, Size, Allocator};
6005
6006 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6007
6008 return Builder.CreateCall(Fn, Args, Name);
6009}
6010
6012 Value *Addr, Value *Allocator,
6013 std::string Name) {
6015 updateToLocation(Loc);
6016
6017 uint32_t SrcLocStrSize;
6018 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6019 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6020 Value *ThreadId = getOrCreateThreadID(Ident);
6021 Value *Args[] = {ThreadId, Addr, Allocator};
6022 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6023 return Builder.CreateCall(Fn, Args, Name);
6024}
6025
6027 const LocationDescription &Loc, Value *InteropVar,
6028 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6029 Value *DependenceAddress, bool HaveNowaitClause) {
6031 updateToLocation(Loc);
6032
6033 uint32_t SrcLocStrSize;
6034 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6035 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6036 Value *ThreadId = getOrCreateThreadID(Ident);
6037 if (Device == nullptr)
6039 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6040 if (NumDependences == nullptr) {
6041 NumDependences = ConstantInt::get(Int32, 0);
6042 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6043 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6044 }
6045 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6046 Value *Args[] = {
6047 Ident, ThreadId, InteropVar, InteropTypeVal,
6048 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6049
6050 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6051
6052 return Builder.CreateCall(Fn, Args);
6053}
6054
6056 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6057 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6059 updateToLocation(Loc);
6060
6061 uint32_t SrcLocStrSize;
6062 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6063 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6064 Value *ThreadId = getOrCreateThreadID(Ident);
6065 if (Device == nullptr)
6067 if (NumDependences == nullptr) {
6068 NumDependences = ConstantInt::get(Int32, 0);
6069 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6070 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6071 }
6072 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6073 Value *Args[] = {
6074 Ident, ThreadId, InteropVar, Device,
6075 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6076
6077 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6078
6079 return Builder.CreateCall(Fn, Args);
6080}
6081
6083 Value *InteropVar, Value *Device,
6084 Value *NumDependences,
6085 Value *DependenceAddress,
6086 bool HaveNowaitClause) {
6088 updateToLocation(Loc);
6089 uint32_t SrcLocStrSize;
6090 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6091 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6092 Value *ThreadId = getOrCreateThreadID(Ident);
6093 if (Device == nullptr)
6095 if (NumDependences == nullptr) {
6096 NumDependences = ConstantInt::get(Int32, 0);
6097 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6098 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6099 }
6100 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6101 Value *Args[] = {
6102 Ident, ThreadId, InteropVar, Device,
6103 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6104
6105 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6106
6107 return Builder.CreateCall(Fn, Args);
6108}
6109
6111 const LocationDescription &Loc, llvm::Value *Pointer,
6114 updateToLocation(Loc);
6115
6116 uint32_t SrcLocStrSize;
6117 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6118 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6119 Value *ThreadId = getOrCreateThreadID(Ident);
6120 Constant *ThreadPrivateCache =
6121 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6122 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6123
6124 Function *Fn =
6125 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6126
6127 return Builder.CreateCall(Fn, Args);
6128}
6129
6132 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6133 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6134 if (!updateToLocation(Loc))
6135 return Loc.IP;
6136
6137 uint32_t SrcLocStrSize;
6138 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6139 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6140 Constant *IsSPMDVal = ConstantInt::getSigned(
6142 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6143 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6144 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6145
6146 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6147 Function *Kernel = DebugKernelWrapper;
6148
6149 // We need to strip the debug prefix to get the correct kernel name.
6150 StringRef KernelName = Kernel->getName();
6151 const std::string DebugPrefix = "_debug__";
6152 if (KernelName.ends_with(DebugPrefix)) {
6153 KernelName = KernelName.drop_back(DebugPrefix.length());
6154 Kernel = M.getFunction(KernelName);
6155 assert(Kernel && "Expected the real kernel to exist");
6156 }
6157
6158 // Manifest the launch configuration in the metadata matching the kernel
6159 // environment.
6160 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6161 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6162
6163 // For max values, < 0 means unset, == 0 means set but unknown.
6164 if (MaxThreadsVal < 0)
6165 MaxThreadsVal = std::max(
6166 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6167
6168 if (MaxThreadsVal > 0)
6169 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6170
6171 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6173 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6174 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6175 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6176 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6177
6179 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6180 const DataLayout &DL = Fn->getDataLayout();
6181
6182 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6183 Constant *DynamicEnvironmentInitializer =
6184 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6185 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6186 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6187 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6188 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6189 DL.getDefaultGlobalsAddressSpace());
6190 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6191
6192 Constant *DynamicEnvironment =
6193 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6194 ? DynamicEnvironmentGV
6195 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6196 DynamicEnvironmentPtr);
6197
6198 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6199 ConfigurationEnvironment, {
6200 UseGenericStateMachineVal,
6201 MayUseNestedParallelismVal,
6202 IsSPMDVal,
6203 MinThreads,
6204 MaxThreads,
6205 MinTeams,
6206 MaxTeams,
6207 ReductionDataSize,
6208 ReductionBufferLength,
6209 });
6210 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6211 KernelEnvironment, {
6212 ConfigurationEnvironmentInitializer,
6213 Ident,
6214 DynamicEnvironment,
6215 });
6216 std::string KernelEnvironmentName =
6217 (KernelName + "_kernel_environment").str();
6218 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6219 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6220 KernelEnvironmentInitializer, KernelEnvironmentName,
6221 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6222 DL.getDefaultGlobalsAddressSpace());
6223 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6224
6225 Constant *KernelEnvironment =
6226 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6227 ? KernelEnvironmentGV
6228 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6229 KernelEnvironmentPtr);
6230 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6231 CallInst *ThreadKind =
6232 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6233
6234 Value *ExecUserCode = Builder.CreateICmpEQ(
6235 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6236 "exec_user_code");
6237
6238 // ThreadKind = __kmpc_target_init(...)
6239 // if (ThreadKind == -1)
6240 // user_code
6241 // else
6242 // return;
6243
6244 auto *UI = Builder.CreateUnreachable();
6245 BasicBlock *CheckBB = UI->getParent();
6246 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6247
6248 BasicBlock *WorkerExitBB = BasicBlock::Create(
6249 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6250 Builder.SetInsertPoint(WorkerExitBB);
6252
6253 auto *CheckBBTI = CheckBB->getTerminator();
6254 Builder.SetInsertPoint(CheckBBTI);
6255 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6256
6257 CheckBBTI->eraseFromParent();
6258 UI->eraseFromParent();
6259
6260 // Continue in the "user_code" block, see diagram above and in
6261 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6262 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6263}
6264
6266 int32_t TeamsReductionDataSize,
6267 int32_t TeamsReductionBufferLength) {
6268 if (!updateToLocation(Loc))
6269 return;
6270
6272 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6273
6274 Builder.CreateCall(Fn, {});
6275
6276 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6277 return;
6278
6280 // We need to strip the debug prefix to get the correct kernel name.
6281 StringRef KernelName = Kernel->getName();
6282 const std::string DebugPrefix = "_debug__";
6283 if (KernelName.ends_with(DebugPrefix))
6284 KernelName = KernelName.drop_back(DebugPrefix.length());
6285 auto *KernelEnvironmentGV =
6286 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6287 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6288 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6289 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6290 KernelEnvironmentInitializer,
6291 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6292 NewInitializer = ConstantFoldInsertValueInstruction(
6293 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6294 {0, 8});
6295 KernelEnvironmentGV->setInitializer(NewInitializer);
6296}
6297
6299 Module &M = *Kernel.getParent();
6300 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6301 for (auto *Op : MD->operands()) {
6302 if (Op->getNumOperands() != 3)
6303 continue;
6304 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6305 if (!KernelOp || KernelOp->getValue() != &Kernel)
6306 continue;
6307 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6308 if (!Prop || Prop->getString() != Name)
6309 continue;
6310 return Op;
6311 }
6312 return nullptr;
6313}
6314
6316 bool Min) {
6317 // Update the "maxntidx" metadata for NVIDIA, or add it.
6318 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6319 if (ExistingOp) {
6320 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6321 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6322 ExistingOp->replaceOperandWith(
6323 2, ConstantAsMetadata::get(ConstantInt::get(
6324 OldVal->getValue()->getType(),
6325 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6326 } else {
6327 LLVMContext &Ctx = Kernel.getContext();
6329 MDString::get(Ctx, Name),
6331 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6332 // Append metadata to nvvm.annotations
6333 Module &M = *Kernel.getParent();
6334 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6335 MD->addOperand(MDNode::get(Ctx, MDVals));
6336 }
6337}
6338
6339std::pair<int32_t, int32_t>
6341 int32_t ThreadLimit =
6342 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6343
6344 if (T.isAMDGPU()) {
6345 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6346 if (!Attr.isValid() || !Attr.isStringAttribute())
6347 return {0, ThreadLimit};
6348 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6349 int32_t LB, UB;
6350 if (!llvm::to_integer(UBStr, UB, 10))
6351 return {0, ThreadLimit};
6352 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6353 if (!llvm::to_integer(LBStr, LB, 10))
6354 return {0, UB};
6355 return {LB, UB};
6356 }
6357
6358 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6359 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6360 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6361 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6362 }
6363 return {0, ThreadLimit};
6364}
6365
6367 Function &Kernel, int32_t LB,
6368 int32_t UB) {
6369 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6370
6371 if (T.isAMDGPU()) {
6372 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6373 llvm::utostr(LB) + "," + llvm::utostr(UB));
6374 return;
6375 }
6376
6377 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6378}
6379
6380std::pair<int32_t, int32_t>
6382 // TODO: Read from backend annotations if available.
6383 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6384}
6385
6387 int32_t LB, int32_t UB) {
6388 if (T.isNVPTX())
6389 if (UB > 0)
6390 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6391 if (T.isAMDGPU())
6392 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6393
6394 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6395}
6396
6397void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6398 Function *OutlinedFn) {
6399 if (Config.isTargetDevice()) {
6401 // TODO: Determine if DSO local can be set to true.
6402 OutlinedFn->setDSOLocal(false);
6404 if (T.isAMDGCN())
6406 }
6407}
6408
6409Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6410 StringRef EntryFnIDName) {
6411 if (Config.isTargetDevice()) {
6412 assert(OutlinedFn && "The outlined function must exist if embedded");
6413 return OutlinedFn;
6414 }
6415
6416 return new GlobalVariable(
6417 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6418 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6419}
6420
6421Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6422 StringRef EntryFnName) {
6423 if (OutlinedFn)
6424 return OutlinedFn;
6425
6426 assert(!M.getGlobalVariable(EntryFnName, true) &&
6427 "Named kernel already exists?");
6428 return new GlobalVariable(
6429 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6430 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6431}
6432
6434 TargetRegionEntryInfo &EntryInfo,
6435 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6436 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6437
6438 SmallString<64> EntryFnName;
6439 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6440
6442 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6443 if (!CBResult)
6444 return CBResult.takeError();
6445 OutlinedFn = *CBResult;
6446 } else {
6447 OutlinedFn = nullptr;
6448 }
6449
6450 // If this target outline function is not an offload entry, we don't need to
6451 // register it. This may be in the case of a false if clause, or if there are
6452 // no OpenMP targets.
6453 if (!IsOffloadEntry)
6454 return Error::success();
6455
6456 std::string EntryFnIDName =
6458 ? std::string(EntryFnName)
6459 : createPlatformSpecificName({EntryFnName, "region_id"});
6460
6461 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6462 EntryFnName, EntryFnIDName);
6463 return Error::success();
6464}
6465
6467 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6468 StringRef EntryFnName, StringRef EntryFnIDName) {
6469 if (OutlinedFn)
6470 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6471 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6472 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6474 EntryInfo, EntryAddr, OutlinedFnID,
6476 return OutlinedFnID;
6477}
6478
6480 const LocationDescription &Loc, InsertPointTy AllocaIP,
6481 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6482 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6483 omp::RuntimeFunction *MapperFunc,
6485 BodyGenTy BodyGenType)>
6486 BodyGenCB,
6487 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6488 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6489 if (!updateToLocation(Loc))
6490 return InsertPointTy();
6491
6492 Builder.restoreIP(CodeGenIP);
6493 // Disable TargetData CodeGen on Device pass.
6494 if (Config.IsTargetDevice.value_or(false)) {
6495 if (BodyGenCB) {
6496 InsertPointOrErrorTy AfterIP =
6497 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6498 if (!AfterIP)
6499 return AfterIP.takeError();
6500 Builder.restoreIP(*AfterIP);
6501 }
6502 return Builder.saveIP();
6503 }
6504
6505 bool IsStandAlone = !BodyGenCB;
6506 MapInfosTy *MapInfo;
6507 // Generate the code for the opening of the data environment. Capture all the
6508 // arguments of the runtime call by reference because they are used in the
6509 // closing of the region.
6510 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6511 InsertPointTy CodeGenIP) -> Error {
6512 MapInfo = &GenMapInfoCB(Builder.saveIP());
6513 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6514 /*IsNonContiguous=*/true, DeviceAddrCB,
6515 CustomMapperCB);
6516
6517 TargetDataRTArgs RTArgs;
6519
6520 // Emit the number of elements in the offloading arrays.
6521 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6522
6523 // Source location for the ident struct
6524 if (!SrcLocInfo) {
6525 uint32_t SrcLocStrSize;
6526 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6527 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6528 }
6529
6530 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6531 SrcLocInfo, DeviceID,
6532 PointerNum, RTArgs.BasePointersArray,
6533 RTArgs.PointersArray, RTArgs.SizesArray,
6534 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6535 RTArgs.MappersArray};
6536
6537 if (IsStandAlone) {
6538 assert(MapperFunc && "MapperFunc missing for standalone target data");
6539
6540 auto TaskBodyCB = [&](Value *, Value *,
6542 if (Info.HasNoWait) {
6543 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6547 }
6548
6550 OffloadingArgs);
6551
6552 if (Info.HasNoWait) {
6553 BasicBlock *OffloadContBlock =
6554 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6556 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6558 }
6559 return Error::success();
6560 };
6561
6562 bool RequiresOuterTargetTask = Info.HasNoWait;
6563 if (!RequiresOuterTargetTask) {
6564 Error Err = TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6565 /*TargetTaskAllocaIP=*/{});
6566 assert(!Err && "TaskBodyCB expected to succeed");
6567 } else {
6568 InsertPointOrErrorTy AfterIP =
6569 emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6570 /*Dependencies=*/{}, Info.HasNoWait);
6571 assert(AfterIP && "TaskBodyCB expected to succeed");
6572 }
6573 } else {
6574 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6575 omp::OMPRTL___tgt_target_data_begin_mapper);
6576
6577 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6578
6579 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6580 if (isa<AllocaInst>(DeviceMap.second.second)) {
6581 auto *LI =
6582 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6583 Builder.CreateStore(LI, DeviceMap.second.second);
6584 }
6585 }
6586
6587 // If device pointer privatization is required, emit the body of the
6588 // region here. It will have to be duplicated: with and without
6589 // privatization.
6590 InsertPointOrErrorTy AfterIP =
6591 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6592 if (!AfterIP)
6593 return AfterIP.takeError();
6594 Builder.restoreIP(*AfterIP);
6595 }
6596 return Error::success();
6597 };
6598
6599 // If we need device pointer privatization, we need to emit the body of the
6600 // region with no privatization in the 'else' branch of the conditional.
6601 // Otherwise, we don't have to do anything.
6602 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6603 InsertPointTy CodeGenIP) -> Error {
6604 InsertPointOrErrorTy AfterIP =
6605 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6606 if (!AfterIP)
6607 return AfterIP.takeError();
6608 Builder.restoreIP(*AfterIP);
6609 return Error::success();
6610 };
6611
6612 // Generate code for the closing of the data region.
6613 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6614 TargetDataRTArgs RTArgs;
6615 Info.EmitDebug = !MapInfo->Names.empty();
6616 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6617
6618 // Emit the number of elements in the offloading arrays.
6619 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6620
6621 // Source location for the ident struct
6622 if (!SrcLocInfo) {
6623 uint32_t SrcLocStrSize;
6624 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6625 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6626 }
6627
6628 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6629 PointerNum, RTArgs.BasePointersArray,
6630 RTArgs.PointersArray, RTArgs.SizesArray,
6631 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6632 RTArgs.MappersArray};
6633 Function *EndMapperFunc =
6634 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6635
6636 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6637 return Error::success();
6638 };
6639
6640 // We don't have to do anything to close the region if the if clause evaluates
6641 // to false.
6642 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6643 return Error::success();
6644 };
6645
6646 Error Err = [&]() -> Error {
6647 if (BodyGenCB) {
6648 Error Err = [&]() {
6649 if (IfCond)
6650 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6651 return BeginThenGen(AllocaIP, Builder.saveIP());
6652 }();
6653
6654 if (Err)
6655 return Err;
6656
6657 // If we don't require privatization of device pointers, we emit the body
6658 // in between the runtime calls. This avoids duplicating the body code.
6659 InsertPointOrErrorTy AfterIP =
6660 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6661 if (!AfterIP)
6662 return AfterIP.takeError();
6663 Builder.restoreIP(*AfterIP);
6664
6665 if (IfCond)
6666 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6667 return EndThenGen(AllocaIP, Builder.saveIP());
6668 }
6669 if (IfCond)
6670 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6671 return BeginThenGen(AllocaIP, Builder.saveIP());
6672 }();
6673
6674 if (Err)
6675 return Err;
6676
6677 return Builder.saveIP();
6678}
6679
6682 bool IsGPUDistribute) {
6683 assert((IVSize == 32 || IVSize == 64) &&
6684 "IV size is not compatible with the omp runtime");
6686 if (IsGPUDistribute)
6687 Name = IVSize == 32
6688 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6689 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6690 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6691 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6692 else
6693 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6694 : omp::OMPRTL___kmpc_for_static_init_4u)
6695 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6696 : omp::OMPRTL___kmpc_for_static_init_8u);
6697
6699}
6700
6702 bool IVSigned) {
6703 assert((IVSize == 32 || IVSize == 64) &&
6704 "IV size is not compatible with the omp runtime");
6705 RuntimeFunction Name = IVSize == 32
6706 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6707 : omp::OMPRTL___kmpc_dispatch_init_4u)
6708 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6709 : omp::OMPRTL___kmpc_dispatch_init_8u);
6710
6712}
6713
6715 bool IVSigned) {
6716 assert((IVSize == 32 || IVSize == 64) &&
6717 "IV size is not compatible with the omp runtime");
6718 RuntimeFunction Name = IVSize == 32
6719 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6720 : omp::OMPRTL___kmpc_dispatch_next_4u)
6721 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6722 : omp::OMPRTL___kmpc_dispatch_next_8u);
6723
6725}
6726
6728 bool IVSigned) {
6729 assert((IVSize == 32 || IVSize == 64) &&
6730 "IV size is not compatible with the omp runtime");
6731 RuntimeFunction Name = IVSize == 32
6732 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6733 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6734 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6735 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6736
6738}
6739
6741 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6742}
6743
6745 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6749 SmallVector<Type *> ParameterTypes;
6750 if (OMPBuilder.Config.isTargetDevice()) {
6751 // Add the "implicit" runtime argument we use to provide launch specific
6752 // information for target devices.
6753 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6754 ParameterTypes.push_back(Int8PtrTy);
6755
6756 // All parameters to target devices are passed as pointers
6757 // or i64. This assumes 64-bit address spaces/pointers.
6758 for (auto &Arg : Inputs)
6759 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6760 ? Arg->getType()
6761 : Type::getInt64Ty(Builder.getContext()));
6762 } else {
6763 for (auto &Arg : Inputs)
6764 ParameterTypes.push_back(Arg->getType());
6765 }
6766
6767 auto BB = Builder.GetInsertBlock();
6768 auto M = BB->getModule();
6769 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6770 /*isVarArg*/ false);
6771 auto Func =
6772 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6773
6774 // Save insert point.
6775 IRBuilder<>::InsertPointGuard IPG(Builder);
6776 // If there's a DISubprogram associated with current function, then
6777 // generate one for the outlined function.
6778 if (Function *ParentFunc = BB->getParent()) {
6779 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6780 DICompileUnit *CU = SP->getUnit();
6781 DIBuilder DB(*M, true, CU);
6783 if (DL) {
6784 // TODO: We are using nullopt for arguments at the moment. This will
6785 // need to be updated when debug data is being generated for variables.
6786 DISubroutineType *Ty =
6787 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6788 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6789 DISubprogram::SPFlagOptimized |
6790 DISubprogram::SPFlagLocalToUnit;
6791
6792 DISubprogram *OutlinedSP = DB.createFunction(
6793 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6794 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6795
6796 // Attach subprogram to the function.
6797 Func->setSubprogram(OutlinedSP);
6798 // Update the CurrentDebugLocation in the builder so that right scope
6799 // is used for things inside outlined function.
6801 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6802 OutlinedSP, DL.getInlinedAt()));
6803 }
6804 }
6805 }
6806
6807 // Generate the region into the function.
6808 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6809 Builder.SetInsertPoint(EntryBB);
6810
6811 // Insert target init call in the device compilation pass.
6812 if (OMPBuilder.Config.isTargetDevice())
6813 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6814
6815 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6816
6817 // As we embed the user code in the middle of our target region after we
6818 // generate entry code, we must move what allocas we can into the entry
6819 // block to avoid possible breaking optimisations for device
6820 if (OMPBuilder.Config.isTargetDevice())
6822
6823 // Insert target deinit call in the device compilation pass.
6824 BasicBlock *OutlinedBodyBB =
6825 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6827 Builder.saveIP(),
6828 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6829 if (!AfterIP)
6830 return AfterIP.takeError();
6831 Builder.restoreIP(*AfterIP);
6832 if (OMPBuilder.Config.isTargetDevice())
6833 OMPBuilder.createTargetDeinit(Builder);
6834
6835 // Insert return instruction.
6836 Builder.CreateRetVoid();
6837
6838 // New Alloca IP at entry point of created device function.
6839 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6840 auto AllocaIP = Builder.saveIP();
6841
6842 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6843
6844 // Skip the artificial dyn_ptr on the device.
6845 const auto &ArgRange =
6846 OMPBuilder.Config.isTargetDevice()
6847 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6848 : Func->args();
6849
6850 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6851 // Things like GEP's can come in the form of Constants. Constants and
6852 // ConstantExpr's do not have access to the knowledge of what they're
6853 // contained in, so we must dig a little to find an instruction so we
6854 // can tell if they're used inside of the function we're outlining. We
6855 // also replace the original constant expression with a new instruction
6856 // equivalent; an instruction as it allows easy modification in the
6857 // following loop, as we can now know the constant (instruction) is
6858 // owned by our target function and replaceUsesOfWith can now be invoked
6859 // on it (cannot do this with constants it seems). A brand new one also
6860 // allows us to be cautious as it is perhaps possible the old expression
6861 // was used inside of the function but exists and is used externally
6862 // (unlikely by the nature of a Constant, but still).
6863 // NOTE: We cannot remove dead constants that have been rewritten to
6864 // instructions at this stage, we run the risk of breaking later lowering
6865 // by doing so as we could still be in the process of lowering the module
6866 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6867 // constants we have created rewritten versions of.
6868 if (auto *Const = dyn_cast<Constant>(Input))
6869 convertUsersOfConstantsToInstructions(Const, Func, false);
6870
6871 // Collect all the instructions
6872 for (User *User : make_early_inc_range(Input->users()))
6873 if (auto *Instr = dyn_cast<Instruction>(User))
6874 if (Instr->getFunction() == Func)
6875 Instr->replaceUsesOfWith(Input, InputCopy);
6876 };
6877
6878 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6879
6880 // Rewrite uses of input values to parameters.
6881 for (auto InArg : zip(Inputs, ArgRange)) {
6882 Value *Input = std::get<0>(InArg);
6883 Argument &Arg = std::get<1>(InArg);
6884 Value *InputCopy = nullptr;
6885
6887 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6888 if (!AfterIP)
6889 return AfterIP.takeError();
6890 Builder.restoreIP(*AfterIP);
6891
6892 // In certain cases a Global may be set up for replacement, however, this
6893 // Global may be used in multiple arguments to the kernel, just segmented
6894 // apart, for example, if we have a global array, that is sectioned into
6895 // multiple mappings (technically not legal in OpenMP, but there is a case
6896 // in Fortran for Common Blocks where this is necessary), we will end up
6897 // with GEP's into this array inside the kernel, that refer to the Global
6898 // but are technically separate arguments to the kernel for all intents and
6899 // purposes. If we have mapped a segment that requires a GEP into the 0-th
6900 // index, it will fold into a reference to the Global, if we then encounter
6901 // this folded GEP during replacement all of the references to the
6902 // Global in the kernel will be replaced with the argument we have generated
6903 // that corresponds to it, including any other GEP's that refer to the
6904 // Global that may be other arguments. This will invalidate all of the other
6905 // preceding mapped arguments that refer to the same global that may be
6906 // separate segments. To prevent this, we defer global processing until all
6907 // other processing has been performed.
6908 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6909 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6910 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6911 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6912 continue;
6913 }
6914
6915 ReplaceValue(Input, InputCopy, Func);
6916 }
6917
6918 // Replace all of our deferred Input values, currently just Globals.
6919 for (auto Deferred : DeferredReplacement)
6920 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6921
6922 return Func;
6923}
6924
6925/// Create the entry point function for a target task.
6926/// It'll have the following signature
6927/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6928/// This function is called from emitTargetTask once the
6929/// code to launch the target kernel has been outlined already.
///
/// StaleCI is the call to the outlined kernel launch function; its callee is
/// the function the proxy forwards to. The body of the proxy is emitted
/// through Builder, whose insertion point is moved into the proxy's entry
/// block. Returns the newly created proxy function.
6931 IRBuilderBase &Builder,
6932 CallInst *StaleCI) {
6933 Module &M = OMPBuilder.M;
6934 // KernelLaunchFunction is the target launch function, i.e.
6935 // the function that sets up kernel arguments and calls
6936 // __tgt_target_kernel to launch the kernel on the device.
6937 //
6938 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6939
6940 // StaleCI is the CallInst which is the call to the outlined
6941 // target kernel launch function. If there are values that the
6942 // outlined function uses then these are aggregated into a structure
6943 // which is passed as the second argument. If not, then there's
6944 // only one argument, the threadID. So, StaleCI can be
6945 //
6946 // %structArg = alloca { ptr, ptr }, align 8
6947 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6948 // store ptr %20, ptr %gep_, align 8
6949 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6950 // store ptr %21, ptr %gep_8, align 8
6951 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6952 //
6953 // OR
6954 //
6955 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6957 StaleCI->getIterator());
6958 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6959 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6960 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6961 Type *TaskTy = OMPBuilder.Task;
 // The proxy has the fixed signature void(i32 thread.id, ptr task) and
 // internal linkage.
6962 auto ProxyFnTy =
6963 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6964 /* isVarArg */ false);
6965 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6966 ".omp_target_task_proxy_func",
6967 Builder.GetInsertBlock()->getModule());
6968 ProxyFn->getArg(0)->setName("thread.id");
6969 ProxyFn->getArg(1)->setName("task");
6970
6971 BasicBlock *EntryBB =
6972 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6973 Builder.SetInsertPoint(EntryBB);
6974
6975 bool HasShareds = StaleCI->arg_size() > 1;
6976 // TODO: This is a temporary assert to prove to ourselves that
6977 // the outlined target launch function is always going to have
6978 // at most two arguments if there is any data shared between
6979 // host and device.
6980 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6981 "StaleCI with shareds should have exactly two arguments.");
6982 if (HasShareds) {
 // The struct type of the shared values is recovered from the alloca that
 // is passed as StaleCI's second argument.
6983 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6984 assert(ArgStructAlloca &&
6985 "Unable to find the alloca instruction corresponding to arguments "
6986 "for extracted function");
6987 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
6988
6989 AllocaInst *NewArgStructAlloca =
6990 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6991 Value *TaskT = ProxyFn->getArg(1);
6992 Value *ThreadId = ProxyFn->getArg(0);
6993 Value *SharedsSize =
6994 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6995
 // Load the pointer stored in field 0 of the task structure and copy
 // SharedsSize bytes from it into the proxy-local struct.
6996 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6997 LoadInst *LoadShared =
6998 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
6999
7000 Builder.CreateMemCpy(
7001 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7002 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7003
7004 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7005 }
 // NOTE(review): when HasShareds is false, the visible code emits no call to
 // KernelLaunchFunction and the proxy only returns -- confirm this is
 // intended.
7006 Builder.CreateRetVoid();
7007 return ProxyFn;
7008}
7009
7011 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7012 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
7013 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
7016
 // Wrap createOutlinedFunction in a callback so that
 // emitTargetRegionFunction can invoke it with the entry function name it is
 // given. The lambda captures by reference and is only used within this
 // function.
7017 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7018 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
7019 &ArgAccessorFuncCB](StringRef EntryFnName) {
7020 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
7021 CBFunc, ArgAccessorFuncCB);
7022 };
7023
 // OutlinedFn and OutlinedFnID are reference parameters that are forwarded
 // to emitTargetRegionFunction.
7024 return OMPBuilder.emitTargetRegionFunction(
7025 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7026 OutlinedFnID);
7027}
7028
7030 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7033 bool HasNoWait) {
7034
7035 // The following explains the code-gen scenario for the `target` directive. A
7036 // similar scenario is followed for other device-related directives (e.g.
7037 // `target enter data`) but in similar fashion since we only need to emit a
7038 // task that encapsulates the proper runtime call.
7039 //
7040 // When we arrive at this function, the target region itself has been
7041 // outlined into the function OutlinedFn.
7042 // So at this point, for
7043 // --------------------------------------------------
7044 // void user_code_that_offloads(...) {
7045 // omp target depend(..) map(from:a) map(to:b, c)
7046 // a = b + c
7047 // }
7048 //
7049 // --------------------------------------------------
7050 //
7051 // we have
7052 //
7053 // --------------------------------------------------
7054 //
7055 // void user_code_that_offloads(...) {
7056 // %.offload_baseptrs = alloca [3 x ptr], align 8
7057 // %.offload_ptrs = alloca [3 x ptr], align 8
7058 // %.offload_mappers = alloca [3 x ptr], align 8
7059 // ;; target region has been outlined and now we need to
7060 // ;; offload to it via a target task.
7061 // }
7062 // void outlined_device_function(ptr a, ptr b, ptr c) {
7063 // *a = *b + *c
7064 // }
7065 //
7066 // We have to now do the following
7067 // (i) Make an offloading call to outlined_device_function using the OpenMP
7068 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7069 // emitted by emitKernelLaunch
7070 // (ii) Create a task entry point function that calls kernel_launch_function
7071 // and is the entry point for the target task. See
7072 // '@.omp_target_task_proxy_func' in the pseudocode below.
7073 // (iii) Create a task with the task entry point created in (ii)
7074 //
7075 // That is we create the following
7076 //
7077 // void user_code_that_offloads(...) {
7078 // %.offload_baseptrs = alloca [3 x ptr], align 8
7079 // %.offload_ptrs = alloca [3 x ptr], align 8
7080 // %.offload_mappers = alloca [3 x ptr], align 8
7081 //
7082 // %structArg = alloca { ptr, ptr, ptr }, align 8
7083 // %strucArg[0] = %.offload_baseptrs
7084 // %strucArg[1] = %.offload_ptrs
7085 // %strucArg[2] = %.offload_mappers
7086 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7087 // @.omp_target_task_proxy_func)
7088 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7089 // dependencies_array = ...
7090 // ;; if nowait not present
7091 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7092 // call @__kmpc_omp_task_begin_if0(...)
7093 // call @.omp_target_task_proxy_func(i32 thread_id,
7094 // ptr %proxy_target_task)
 // call @__kmpc_omp_task_complete_if0(...)
7095 // }
7096 //
7097 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7098 // ptr %task) {
7099 // %structArg = alloca {ptr, ptr, ptr}
7100 // %shared_data = load (getelementptr %task, 0, 0)
7101 // memcpy(%structArg, %shared_data, sizeof(structArg))
7102 // kernel_launch_function(%thread.id, %structArg)
7103 // }
7104 //
7105 // We need the proxy function because the signature of the task entry point
7106 // expected by kmpc_omp_task is always the same and will be different from
7107 // that of the kernel_launch function.
7108 //
7109 // kernel_launch_function is generated by emitKernelLaunch and has the
7110 // always_inline attribute.
7111 // void kernel_launch_function(thread_id,
7112 // structArg) alwaysinline {
7113 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7114 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7115 // offload_ptrs = load(getelementptr structArg, 0, 1)
7116 // offload_mappers = load(getelementptr structArg, 0, 2)
7117 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7118 // ; offload_mappers
7119 // call i32 @__tgt_target_kernel(...,
7120 // outlined_device_function,
7121 // ptr %kernel_args)
7122 // }
7123 // void outlined_device_function(ptr a, ptr b, ptr c) {
7124 // *a = *b + *c
7125 // }
7126 //
 // Split off a body block and an alloca block. The region from
 // TargetTaskAllocaBB up to the block the builder ends in after TaskBodyCB is
 // what gets registered for outlining below.
7127 BasicBlock *TargetTaskBodyBB =
7128 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7129 BasicBlock *TargetTaskAllocaBB =
7130 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7131
7132 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7133 TargetTaskAllocaBB->begin());
7134 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7135
7136 OutlineInfo OI;
7137 OI.EntryBB = TargetTaskAllocaBB;
7138 OI.OuterAllocaBB = AllocaIP.getBlock();
7139
7140 // Add the thread ID argument.
7143 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7144
7145 Builder.restoreIP(TargetTaskBodyIP);
7146
 // Let the caller emit the task body; any error it reports is propagated.
7147 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7148 return Err;
7149
7150 OI.ExitBB = Builder.saveIP().getBlock();
 // Runs after the region has been outlined: the single call to the outlined
 // function (StaleCI) is replaced by task allocation and dispatch code, and
 // StaleCI plus the ToBeDeleted instructions are erased at the end.
7151 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7152 DeviceID](Function &OutlinedFn) mutable {
7153 assert(OutlinedFn.getNumUses() == 1 &&
7154 "there must be a single user for the outlined function");
7155
7156 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7157 bool HasShareds = StaleCI->arg_size() > 1;
7158
7159 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7160
7161 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7162 << "\n");
7163
 // emitTargetTaskProxyFunction moved the builder into the proxy function;
 // move it back to the call being replaced.
7164 Builder.SetInsertPoint(StaleCI);
7165
7166 // Gather the arguments for emitting the runtime call.
7167 uint32_t SrcLocStrSize;
7168 Constant *SrcLocStr =
7170 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7171
7172 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7173 //
7174 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7175 // the DeviceID to the deferred task and also since
7176 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7177 Function *TaskAllocFn =
7178 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7180 OMPRTL___kmpc_omp_target_task_alloc);
7181
7182 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7183 // of the runtime call.
7184 Value *ThreadID = getOrCreateThreadID(Ident);
7185
7186 // Argument - `sizeof_kmp_task_t` (TaskSize)
7187 // Tasksize refers to the size in bytes of kmp_task_t data structure
7188 // including private vars accessed in task.
7189 // TODO: add kmp_task_t_with_privates (privates)
7190 Value *TaskSize =
7192
7193 // Argument - `sizeof_shareds` (SharedsSize)
7194 // SharedsSize refers to the shareds array size in the kmp_task_t data
7195 // structure.
7196 Value *SharedsSize = Builder.getInt64(0);
7197 if (HasShareds) {
7198 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7199 assert(ArgStructAlloca &&
7200 "Unable to find the alloca instruction corresponding to arguments "
7201 "for extracted function");
7202 auto *ArgStructType =
7203 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7204 assert(ArgStructType && "Unable to find struct type corresponding to "
7205 "arguments for extracted function");
7206 SharedsSize =
7208 }
7209
7210 // Argument - `flags`
7211 // Task is tied iff (Flags & 1) == 1.
7212 // Task is untied iff (Flags & 1) == 0.
7213 // Task is final iff (Flags & 2) == 2.
7214 // Task is not final iff (Flags & 2) == 0.
7215 // A target task is not final and is untied.
7217
7218 // Emit the @__kmpc_omp_task_alloc runtime call
7219 // The runtime call returns a pointer to an area where the task captured
7220 // variables must be copied before the task is run (TaskData)
7221 CallInst *TaskData = nullptr;
7222
7223 SmallVector<llvm::Value *> TaskAllocArgs = {
7224 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7225 /*flags=*/Flags,
7226 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7227 /*task_func=*/ProxyFn};
7228
 // The device ID is appended as an extra argument only on the nowait path.
7229 if (HasNoWait)
7230 TaskAllocArgs.push_back(DeviceID);
7231
7232 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7233
7234 if (HasShareds) {
 // Copy the shared-values struct into the buffer whose address is loaded
 // from the start of the allocated task.
7235 Value *Shareds = StaleCI->getArgOperand(1);
7236 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7237 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7238 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7239 SharedsSize);
7240 }
7241
7242 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7243
7244 // ---------------------------------------------------------------
7245 // V5.2 13.8 target construct
7246 // If the nowait clause is present, execution of the target task
7247 // may be deferred. If the nowait clause is not present, the target task is
7248 // an included task.
7249 // ---------------------------------------------------------------
7250 // The above means that the lack of a nowait on the target construct
7251 // translates to '#pragma omp task if(0)'
7252 if (!HasNoWait) {
7253 if (DepArray) {
7254 Function *TaskWaitFn =
7255 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7257 TaskWaitFn,
7258 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7259 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7260 /*dep_list=*/DepArray,
7261 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7262 /*noalias_dep_list=*/
7264 }
7265 // Included task.
7266 Function *TaskBeginFn =
7267 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7268 Function *TaskCompleteFn =
7269 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7270 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
 // The proxy is called directly, bracketed by the begin_if0/complete_if0
 // runtime calls; the call inherits StaleCI's debug location.
7271 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7272 CI->setDebugLoc(StaleCI->getDebugLoc());
7273 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7274 } else if (DepArray) {
7275 // HasNoWait - meaning the task may be deferred. Call
7276 // __kmpc_omp_task_with_deps if there are dependencies,
7277 // else call __kmpc_omp_task
7278 Function *TaskFn =
7279 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7281 TaskFn,
7282 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7283 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7285 } else {
7286 // Emit the @__kmpc_omp_task runtime call to spawn the task
7287 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7288 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7289 }
7290
 // The original call and the temporary instructions are no longer needed;
 // ToBeDeleted is erased in reverse order.
7291 StaleCI->eraseFromParent();
7292 for (Instruction *I : llvm::reverse(ToBeDeleted))
7293 I->eraseFromParent();
7294 };
7295 addOutlineInfo(std::move(OI));
7296
7297 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7298 << *(Builder.GetInsertBlock()) << "\n");
7299 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7301 << "\n");
7302 return Builder.saveIP();
7303}
7304
7306 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7307 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7308 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7309 function_ref<Value *(unsigned int)> CustomMapperCB) {
 // Two-step convenience wrapper: first emit the offloading arrays for
 // CombinedInfo (recorded in Info), then fill RTArgs from Info.
7310 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7311 DeviceAddrCB, CustomMapperCB);
7312 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7313}
7314
/// Emit the host-side code that runs a target region. Depending on the
/// arguments this is a direct call to the host fallback (no OutlinedFnID), a
/// kernel launch, or either of those wrapped in a target task when HasNoWait
/// is set or Dependencies is non-empty.
7315static void
7317 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7318 Constant *OutlinedFnID, ArrayRef<int32_t> NumTeams,
7322 bool HasNoWait = false) {
7323 // Generate a function call to the host fallback implementation of the target
7324 // region. This is called by the host when no offload entry was generated for
7325 // the target region and when the offloading call fails at runtime.
7326 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7328 Builder.restoreIP(IP);
7329 Builder.CreateCall(OutlinedFn, Args);
7330 return Builder.saveIP();
7331 };
7332
7333 bool HasDependencies = Dependencies.size() > 0;
7334 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7335
7337
 // Body generator handed to emitTargetTask: emits either the kernel launch or
 // the host fallback and leaves the builder at the resulting insertion point.
7338 auto TaskBodyCB =
7339 [&](Value *DeviceID, Value *RTLoc,
7340 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7342 // emitKernelLaunch makes the necessary runtime call to offload the
7343 // kernel. We then outline all that code into a separate function
7344 // ('kernel_launch_function' in the pseudo code above). This function is
7345 // then called by the target task proxy function (see
7346 // '@.omp_target_task_proxy_func' in the pseudo code above)
7347 // '@.omp_target_task_proxy_func' is generated by
7348 // emitTargetTaskProxyFunction.
7349 if (OutlinedFnID)
7350 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7351 EmitTargetCallFallbackCB, KArgs,
7352 DeviceID, RTLoc, TargetTaskAllocaIP);
7353 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7354 // In this case, we execute the host implementation directly.
7355 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7356 }();
7357
7358 if (!AfterIP)
7359 return AfterIP.takeError();
7360
7361 OMPBuilder.Builder.restoreIP(*AfterIP);
7362 return Error::success();
7363 };
7364
7365 // If we don't have an ID for the target region, it means an offload entry
7366 // wasn't created. In this case we just run the host fallback directly.
7367 if (!OutlinedFnID) {
7369 if (RequiresOuterTargetTask) {
7370 // Arguments that are intended to be directly forwarded to an
7371 // emitKernelLaunch call are passed as nullptr, since
7372 // OutlinedFnID=nullptr results in that call not being done.
7373 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7374 /*RTLoc=*/nullptr, AllocaIP,
7375 Dependencies, HasNoWait);
7376 }
7377 return EmitTargetCallFallbackCB(Builder.saveIP());
7378 }();
7379
7380 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7381 // produce any. The 'if' check enables accessing the returned value.
7382 if (AfterIP)
7383 Builder.restoreIP(*AfterIP);
7384 return;
7385 }
7386
7388 /*RequiresDevicePointerInfo=*/false,
7389 /*SeparateBeginEndCalls=*/true);
7390
 // Build the offloading arrays and the runtime arguments derived from them.
7391 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7393 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7394 RTArgs, MapInfo,
7395 /*IsNonContiguous=*/true,
7396 /*ForEndCall=*/false);
7397
 // Materialize the team and thread counts as i32 constants.
7398 SmallVector<Value *, 3> NumTeamsC;
7399 SmallVector<Value *, 3> NumThreadsC;
7400 for (auto V : NumTeams)
7401 NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7402 for (auto V : NumThreads)
7403 NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7404
7405 unsigned NumTargetItems = Info.NumberOfPtrs;
7406 // TODO: Use correct device ID
7407 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7408 uint32_t SrcLocStrSize;
7409 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7410 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7411 llvm::omp::IdentFlag(0), 0);
7412 // TODO: Use correct NumIterations
7413 Value *NumIterations = Builder.getInt64(0);
7414 // TODO: Use correct DynCGGroupMem
7415 Value *DynCGGroupMem = Builder.getInt32(0);
7416
7418 NumTargetItems, RTArgs, NumIterations, NumTeamsC, NumThreadsC,
7419 DynCGGroupMem, HasNoWait);
7420
7421 // The presence of certain clauses on the target directive require the
7422 // explicit generation of the target task.
7424 if (RequiresOuterTargetTask)
7425 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7426 Dependencies, HasNoWait);
7427
7428 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7429 EmitTargetCallFallbackCB, KArgs,
7430 DeviceID, RTLoc, AllocaIP);
7431 }();
7432
7433 // Assume no error was returned because TaskBodyCB and
7434 // EmitTargetCallFallbackCB don't produce any. The 'if' check enables
7435 // accessing the returned value.
7436 if (AfterIP)
7437 Builder.restoreIP(*AfterIP);
7438}
7439
7441 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7442 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7443 ArrayRef<int32_t> NumTeams, ArrayRef<int32_t> NumThreads,
7447 SmallVector<DependData> Dependencies, bool HasNowait) {
7448
 // A default-constructed insertion point is returned when the builder cannot
 // be moved to Loc.
7449 if (!updateToLocation(Loc))
7450 return InsertPointTy();
7451
7452 Builder.restoreIP(CodeGenIP);
7453
 // OutlinedFn and OutlinedFnID are filled in by the outlining step below.
7454 Function *OutlinedFn;
7455 Constant *OutlinedFnID = nullptr;
7456 // The target region is outlined into its own function. The LLVM IR for
7457 // the target region itself is generated using the callbacks CBFunc
7458 // and ArgAccessorFuncCB
7460 *this, Builder, IsOffloadEntry, EntryInfo, OutlinedFn, OutlinedFnID,
7461 Args, CBFunc, ArgAccessorFuncCB))
7462 return Err;
7463
7464 // If we are not on the target device, then we need to generate code
7465 // to make a remote call (offload) to the previously outlined function
7466 // that represents the target region. Do that now.
7467 if (!Config.isTargetDevice())
7468 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7469 NumThreads, Args, GenMapInfoCB, Dependencies, HasNowait);
7470 return Builder.saveIP();
7471}
7472
/// Concatenate \p Parts into a single name. Every part is preceded by a
/// separator: \p FirstSeparator before the first part and \p Separator before
/// each subsequent part. An empty \p Parts yields an empty string.
7473std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7474 StringRef FirstSeparator,
7475 StringRef Separator) {
7476 SmallString<128> Buffer;
7478 StringRef Sep = FirstSeparator;
7479 for (StringRef Part : Parts) {
7480 OS << Sep << Part;
 // Only the first part uses FirstSeparator.
7481 Sep = Separator;
7482 }
7483 return OS.str().str();
7484}
7485
/// Join \p Parts with getNameWithSeparators, using the first separator and
/// separator taken from Config.
7486std::string
7488 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7489 Config.separator());
7490}
7491
7494 unsigned AddressSpace) {
 // Look up (or reserve) the InternalVars slot for Name; a null mapped value
 // means the variable has not been created yet.
7495 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7496 if (Elem.second) {
7497 assert(Elem.second->getValueType() == Ty &&
7498 "OMP internal variable has different type than requested");
7499 } else {
7500 // TODO: investigate the appropriate linkage type used for the global
7501 // variable for possibly changing that to internal or private, or maybe
7502 // create different versions of the function for different OMP internal
7503 // variables.
 // The linkage depends on whether the last occurrence of "wasm32" in the
 // target triple is at position 0, i.e. the triple starts with it.
7504 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7507 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7508 Constant::getNullValue(Ty), Elem.first(),
7509 /*InsertBefore=*/nullptr,
 // Align to the larger of the type's ABI alignment and the pointer ABI
 // alignment for AddressSpace.
7511 const DataLayout &DL = M.getDataLayout();
7512 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7513 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7514 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7515 Elem.second = GV;
7516 }
7517
7518 return Elem.second;
7519}
7520
7521Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7522 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7523 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7524 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7525}
7526
 // Size computation via pointer arithmetic on null: take the address of
 // element 1 relative to a null pointer and convert it to an i64.
 // NOTE(review): the GEP element type is BasePtr's own type
 // (BasePtr->getType()), so the result is the size of that type rather than
 // of anything BasePtr points to -- confirm this is intended.
7529 Value *Null =
7530 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7531 Value *SizeGep =
7532 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7533 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7534 return SizePtrToInt;
7535}
7536
7539 std::string VarName) {
7540 llvm::Constant *MaptypesArrayInit =
 // The map types are emitted as a constant global named VarName with private
 // linkage; unnamed_addr records that its address is not significant.
7542 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7543 M, MaptypesArrayInit->getType(),
7544 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7545 VarName);
7546 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7547 return MaptypesArrayGlobal;
7548}
7549
7551 InsertPointTy AllocaIP,
7552 unsigned NumOperands,
7553 struct MapperAllocas &MapperAllocas) {
 // Nothing is emitted (and MapperAllocas is left untouched) when the builder
 // cannot be moved to Loc.
7554 if (!updateToLocation(Loc))
7555 return;
7556
7557 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7558 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
 // Create the three NumOperands-sized arrays at AllocaIP, then return the
 // builder to Loc.IP.
7559 Builder.restoreIP(AllocaIP);
7560 AllocaInst *ArgsBase = Builder.CreateAlloca(
7561 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7562 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7563 ".offload_ptrs");
7564 AllocaInst *ArgSizes = Builder.CreateAlloca(
7565 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7566 Builder.restoreIP(Loc.IP);
 // Hand the allocas back to the caller through the out-parameter.
7567 MapperAllocas.ArgsBase = ArgsBase;
7568 MapperAllocas.Args = Args;
7569 MapperAllocas.ArgSizes = ArgSizes;
7570}
7571
7573 Function *MapperFunc, Value *SrcLocInfo,
7574 Value *MaptypesArg, Value *MapnamesArg,
7576 int64_t DeviceID, unsigned NumOperands) {
 // Nothing is emitted when the builder cannot be moved to Loc.
7577 if (!updateToLocation(Loc))
7578 return;
7579
7580 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7581 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
 // Each array argument is passed as a GEP with indices {0, 0}.
7582 Value *ArgsBaseGEP =
7584 {Builder.getInt32(0), Builder.getInt32(0)});
7585 Value *ArgsGEP =
7587 {Builder.getInt32(0), Builder.getInt32(0)});
7588 Value *ArgSizesGEP =
7590 {Builder.getInt32(0), Builder.getInt32(0)});
 // The last argument of the call is always a null pointer.
7591 Value *NullPtr =
7592 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7593 Builder.CreateCall(MapperFunc,
7594 {SrcLocInfo, Builder.getInt64(DeviceID),
7595 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7596 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7597}
7598
7600 TargetDataRTArgs &RTArgs,
7601 TargetDataInfo &Info,
7602 bool ForEndCall) {
7603 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7604 "expected region end call to runtime only when end call is separate");
 // All pointer types below are the same unqualified pointer type; the
 // distinct names only document what each argument points to.
7605 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7606 auto VoidPtrTy = UnqualPtrTy;
7607 auto VoidPtrPtrTy = UnqualPtrTy;
7608 auto Int64Ty = Type::getInt64Ty(M.getContext());
7609 auto Int64PtrTy = UnqualPtrTy;
7610
 // With no mapped pointers, every runtime argument is a null pointer.
7611 if (!Info.NumberOfPtrs) {
7612 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7613 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7614 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7615 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7616 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7617 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7618 return;
7619 }
7620
7622 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7623 Info.RTArgs.BasePointersArray,
7624 /*Idx0=*/0, /*Idx1=*/0);
7626 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7627 /*Idx0=*/0,
7628 /*Idx1=*/0);
7630 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7631 /*Idx0=*/0, /*Idx1=*/0);
 // For an end call, prefer the dedicated end map-types array when one exists.
7633 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7634 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7635 : Info.RTArgs.MapTypesArray,
7636 /*Idx0=*/0,
7637 /*Idx1=*/0);
7638
7639 // Only emit the map names array if debug information is
7640 // requested.
7641 if (!Info.EmitDebug)
7642 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7643 else
7645 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7646 /*Idx0=*/0,
7647 /*Idx1=*/0);
7648 // If there is no user-defined mapper, set the mapper array to nullptr to
7649 // avoid an unnecessary data privatization
7650 if (!Info.HasMapper)
7651 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7652 else
7653 RTArgs.MappersArray =
7654 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7655}
7656
7658 InsertPointTy CodeGenIP,
7659 MapInfosTy &CombinedInfo,
7660 TargetDataInfo &Info) {
7662 CombinedInfo.NonContigInfo;
7663
7664 // Build an array of struct descriptor_dim and then assign it to
7665 // offload_args.
7666 //
7667 // struct descriptor_dim {
7668 // uint64_t offset;
7669 // uint64_t count;
7670 // uint64_t stride;
7671 // };
7672 Type *Int64Ty = Builder.getInt64Ty();
7674 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7675 "struct.descriptor_dim");
7676
 // Field indices of struct.descriptor_dim.
7677 enum { OffsetFD = 0, CountFD, StrideFD };
7678 // We need two index variables here since the size of "Dims" is the same as
7679 // the size of Components, however, the size of offset, count, and stride is
7680 // equal to the size of base declaration that is non-contiguous.
7681 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7682 // Skip emitting IR if dimension size is 1 since it cannot be
7683 // non-contiguous.
7684 if (NonContigInfo.Dims[I] == 1)
7685 continue;
 // The descriptor array is allocated at AllocaIP; its contents are filled in
 // at CodeGenIP.
7686 Builder.restoreIP(AllocaIP);
7687 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7688 AllocaInst *DimsAddr =
7689 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7690 Builder.restoreIP(CodeGenIP);
7691 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
 // Descriptor entry II takes its values from position EE - II - 1 of the
 // offset/count/stride lists, i.e. the lists are consumed in reverse.
7692 unsigned RevIdx = EE - II - 1;
7693 Value *DimsLVal = Builder.CreateInBoundsGEP(
7694 DimsAddr->getAllocatedType(), DimsAddr,
7695 {Builder.getInt64(0), Builder.getInt64(II)});
7696 // Offset
7697 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7699 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7700 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7701 // Count
7702 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7704 NonContigInfo.Counts[L][RevIdx], CountLVal,
7705 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7706 // Stride
7707 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
 // NOTE(review): the alignment below is computed from CountLVal's type
 // rather than StrideLVal's. Both come from CreateStructGEP, so presumably
 // the types match, but StrideLVal looks like the intended operand --
 // confirm.
7709 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7710 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7711 }
7712 // args[I] = &dims
7713 Builder.restoreIP(CodeGenIP);
7715 DimsAddr, Builder.getPtrTy());
7717 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7718 Info.RTArgs.PointersArray, 0, I);
 // L advances only for dimensions that were not skipped above.
7721 ++L;
7722 }
7723}
7724
/// Emit the conditional array initialization (\p IsInit true) or array
/// deletion (\p IsInit false) step of a mapper function. A branch is emitted
/// that either enters a new body block or jumps to \p ExitBB; the body block
/// calls __tgt_push_mapper_component with a byte size of
/// \p Size * \p ElementSize and a map type derived from \p MapType. The
/// builder is left in the body block after that call.
7725void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7726 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7727 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7728 BasicBlock *ExitBB, bool IsInit) {
7729 StringRef Prefix = IsInit ? ".init" : ".del";
7730
7731 // Evaluate if this is an array section.
7733 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
 // "Array" here means more than one element (signed Size > 1).
7734 Value *IsArray =
7735 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7736 Value *DeleteBit = Builder.CreateAnd(
7737 MapType,
7739 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7740 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7741 Value *DeleteCond;
7742 Value *Cond;
7743 if (IsInit) {
 // Init runs when the delete bit is clear and either this is an array or
 // Base differs from Begin with the PTR_AND_OBJ bit set.
7744 // base != begin?
7745 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7746 // IsPtrAndObj?
7747 Value *PtrAndObjBit = Builder.CreateAnd(
7748 MapType,
7750 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7751 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7752 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7753 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7754 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7755 DeleteCond = Builder.CreateIsNull(
7756 DeleteBit,
7757 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7758 } else {
 // Deletion runs only for arrays whose map type has the delete bit set.
7759 Cond = IsArray;
7760 DeleteCond = Builder.CreateIsNotNull(
7761 DeleteBit,
7762 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7763 }
7764 Cond = Builder.CreateAnd(Cond, DeleteCond);
7765 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7766
7767 emitBlock(BodyBB, MapperFn);
7768 // Get the array size by multiplying element size and element number (i.e., \p
7769 // Size).
7770 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7771 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7772 // memory allocation/deletion purpose only.
7773 Value *MapTypeArg = Builder.CreateAnd(
7774 MapType,
7776 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7777 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7778 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
 // The implicit bit is additionally set on the map type that is passed on.
7779 MapTypeArg = Builder.CreateOr(
7780 MapTypeArg,
7782 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7783 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7784
7785 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7786 // data structure.
7787 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7788 ArraySize, MapTypeArg, MapName};
7790 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7791 OffloadingArgs);
7792}
7793
/// Build the IR function that implements a user-defined mapper and return it.
///
/// NOTE(review): the line carrying the return type and function name is absent
/// from this listing (presumably OpenMPIRBuilder::emitUserDefinedMapper), as
/// are the lines numbered 7804-7805, 7814, 7840, 7879, 7886, 7890 and 8009.
///
/// The generated function reads six arguments (handle, base, begin, size, map
/// type, map name; see getArg(0..5) below), all marked noundef, and is marked
/// noinline and nounwind. Its body is laid out as:
///   1. a guarded array-init step (emitUDMapperArrayInitOrDel, IsInit=true),
///   2. a loop from the begin pointer to begin + Size elements of \p ElemTy
///      that, for every component reported by \p GenMapInfoCB, derives the
///      component's map type and either calls a child mapper or
///      __tgt_push_mapper_component,
///   3. a guarded array-delete step (IsInit=false),
///   4. the exit block "omp.done".
/// The builder's insert point is saved on entry and restored before returning.
///
/// \param GenMapInfoCB   Called once, inside the loop body, with the current
///                       insert point, the loop's element-pointer PHI and the
///                       mapper's begin argument; returns the map infos.
/// \param ElemTy         Element type; supplies the element store size and the
///                       GEP stride of the loop.
/// \param FuncName       Not referenced in the visible lines (presumably used
///                       on the absent line that creates MapperFn).
/// \param CustomMapperCB Optional; when it returns true for component index I
///                       the Function it stored is called instead of the
///                       runtime push routine.
7795 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7796 llvm::Value *BeginArg)>
7797 GenMapInfoCB,
7798 Type *ElemTy, StringRef FuncName,
7799 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
 // Parameter types of the mapper function (two entries are on absent lines).
7800 SmallVector<Type *> Params;
7801 Params.emplace_back(Builder.getPtrTy());
7802 Params.emplace_back(Builder.getPtrTy());
7803 Params.emplace_back(Builder.getPtrTy());
7806 Params.emplace_back(Builder.getPtrTy());
7807
7808 auto *FnTy =
7809 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7810
 // NOTE(review): TyStr/Out are not used again in the visible lines.
7811 SmallString<64> TyStr;
7812 raw_svector_ostream Out(TyStr);
7813 Function *MapperFn =
7815 MapperFn->addFnAttr(Attribute::NoInline);
7816 MapperFn->addFnAttr(Attribute::NoUnwind);
7817 MapperFn->addParamAttr(0, Attribute::NoUndef);
7818 MapperFn->addParamAttr(1, Attribute::NoUndef);
7819 MapperFn->addParamAttr(2, Attribute::NoUndef);
7820 MapperFn->addParamAttr(3, Attribute::NoUndef);
7821 MapperFn->addParamAttr(4, Attribute::NoUndef);
7822 MapperFn->addParamAttr(5, Attribute::NoUndef);
7823
7824 // Start the mapper function code generation.
7825 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
 // Remember the caller's position; it is restored just before returning.
7826 auto SavedIP = Builder.saveIP();
7827 Builder.SetInsertPoint(EntryBB);
7828
7829 Value *MapperHandle = MapperFn->getArg(0);
7830 Value *BaseIn = MapperFn->getArg(1);
7831 Value *BeginIn = MapperFn->getArg(2);
7832 Value *Size = MapperFn->getArg(3);
7833 Value *MapType = MapperFn->getArg(4);
7834 Value *MapName = MapperFn->getArg(5);
7835
7836 // Compute the starting and end addresses of array elements.
7837 // Prepare common arguments for array initiation and deletion.
7838 // Convert the size in bytes into the number of array elements.
7839 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
 // NOTE(review): the statement on the absent line 7840 presumably performs
 // the byte-to-element conversion mentioned above -- confirm.
7841 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7842 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7843
7844 // Emit array initiation if this is an array section and \p MapType indicates
7845 // that memory allocation is required.
7846 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7847 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7848 MapType, MapName, ElementSize, HeadBB,
7849 /*IsInit=*/true);
7850
7851 // Emit a for loop to iterate through SizeArg of elements and map all of them.
7852
7853 // Emit the loop header block.
7854 emitBlock(HeadBB, MapperFn);
7855 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7856 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
7857 // Evaluate whether the initial condition is satisfied.
7858 Value *IsEmpty =
7859 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
7860 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
7861
7862 // Emit the loop body block.
7863 emitBlock(BodyBB, MapperFn);
 // LastBB tracks the block that will hold the back edge; it moves forward as
 // the per-component blocks below are appended.
7864 BasicBlock *LastBB = BodyBB;
7865 PHINode *PtrPHI =
7866 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
7867 PtrPHI->addIncoming(PtrBegin, HeadBB);
7868
7869 // Get map clause information. Fill up the arrays with all mapped variables.
7870 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
7871
7872 // Call the runtime API __tgt_mapper_num_components to get the number of
7873 // pre-existing components.
7874 Value *OffloadingArgs[] = {MapperHandle};
7875 Value *PreviousSize = Builder.CreateCall(
7876 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
7877 OffloadingArgs);
 // (The initializer of ShiftedPreviousSize is on an absent line.)
7878 Value *ShiftedPreviousSize =
7880
7881 // Fill up the runtime mapper handle for all components.
7882 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
7883 Value *CurBaseArg =
7884 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
7885 Value *CurBeginArg =
7887 Value *CurSizeArg = Info.Sizes[I];
7888 Value *CurNameArg = Info.Names.size()
7889 ? Info.Names[I]
7891
7892 // Extract the MEMBER_OF field from the map type.
7893 Value *OriMapType = Builder.getInt64(
7894 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7895 Info.Types[I]));
7896 Value *MemberMapType =
7897 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
7898
7899 // Combine the map type inherited from user-defined mapper with that
7900 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
7901 // bits of the \a MapType, which is the input argument of the mapper
7902 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
7903 // bits of MemberMapType.
7904 // [OpenMP 5.0], 1.2.6. map-type decay.
7905 // | alloc | to | from | tofrom | release | delete
7906 // ----------------------------------------------------------
7907 // alloc | alloc | alloc | alloc | alloc | release | delete
7908 // to | alloc | to | alloc | to | release | delete
7909 // from | alloc | alloc | from | from | release | delete
7910 // tofrom | alloc | to | from | tofrom | release | delete
7911 Value *LeftToFrom = Builder.CreateAnd(
7912 MapType,
7914 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7915 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7916 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7917 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
7918 BasicBlock *AllocElseBB =
7919 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
7920 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
7921 BasicBlock *ToElseBB =
7922 BasicBlock::Create(M.getContext(), "omp.type.to.else");
7923 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
7924 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
7925 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
7926 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
7927 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
7928 emitBlock(AllocBB, MapperFn);
7929 Value *AllocMapType = Builder.CreateAnd(
7930 MemberMapType,
7932 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7933 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7934 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7935 Builder.CreateBr(EndBB);
7936 emitBlock(AllocElseBB, MapperFn);
7937 Value *IsTo = Builder.CreateICmpEQ(
7938 LeftToFrom,
7940 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7941 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
7942 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
7943 // In case of to, clear OMP_MAP_FROM.
7944 emitBlock(ToBB, MapperFn);
7945 Value *ToMapType = Builder.CreateAnd(
7946 MemberMapType,
7948 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7949 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7950 Builder.CreateBr(EndBB);
7951 emitBlock(ToElseBB, MapperFn);
7952 Value *IsFrom = Builder.CreateICmpEQ(
7953 LeftToFrom,
7955 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7956 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7957 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
7958 // In case of from, clear OMP_MAP_TO.
7959 emitBlock(FromBB, MapperFn);
7960 Value *FromMapType = Builder.CreateAnd(
7961 MemberMapType,
7963 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7964 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
 // FromBB gets no explicit branch here; emitBlock(EndBB) below supplies the
 // fall-through.
7965 // In case of tofrom, do nothing.
7966 emitBlock(EndBB, MapperFn);
7967 LastBB = EndBB;
 // Merge the four possibilities; the ToElseBB edge is the "tofrom" case and
 // carries the unmodified MemberMapType.
7968 PHINode *CurMapType =
7969 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
7970 CurMapType->addIncoming(AllocMapType, AllocBB);
7971 CurMapType->addIncoming(ToMapType, ToBB);
7972 CurMapType->addIncoming(FromMapType, FromBB);
7973 CurMapType->addIncoming(MemberMapType, ToElseBB);
7974
 // Note: this array shadows the single-element OffloadingArgs declared
 // before the loop.
7975 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
7976 CurSizeArg, CurMapType, CurNameArg};
7977 Function *ChildMapperFn = nullptr;
7978 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
7979 // Call the corresponding mapper function.
7980 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
7981 } else {
7982 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7983 // data structure.
7985 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7986 OffloadingArgs);
7987 }
7988 }
7989
7990 // Update the pointer to point to the next element that needs to be mapped,
7991 // and check whether we have mapped all elements.
7992 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
7993 "omp.arraymap.next");
7994 PtrPHI->addIncoming(PtrNext, LastBB);
7995 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
7996 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
7997 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
7998
7999 emitBlock(ExitBB, MapperFn);
8000 // Emit array deletion if this is an array section and \p MapType indicates
8001 // that deletion is required.
8002 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8003 MapType, MapName, ElementSize, DoneBB,
8004 /*IsInit=*/false);
8005
8006 // Emit the function exit block.
8007 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8008
8010 Builder.restoreIP(SavedIP);
8011 return MapperFn;
8012}
8013
/// Materialize the per-argument arrays that are later passed to the offloading
/// runtime and record them in \p Info.RTArgs.
///
/// NOTE(review): the line carrying the return type and function name is absent
/// from this listing (presumably OpenMPIRBuilder::emitOffloadingArrays), as
/// are several statement heads inside the body (gaps in the line numbers).
///
/// Visible effects:
///   - \p Info's array info is cleared and NumberOfPtrs set from
///     \p CombinedInfo.BasePointers; nothing else happens when that is zero.
///   - ".offload_baseptrs", ".offload_ptrs" and ".offload_mappers" are
///     allocated at \p AllocaIP.
///   - Sizes become a constant global when every size is a plain constant, a
///     stack array when every size needs runtime evaluation, and otherwise a
///     stack array that is first initialized from the constant global.
///   - Map types (and, if present, map names) are emitted as globals; a
///     second map-type array without OMP_MAP_PRESENT is created for the end
///     call when \p Info.separateBeginEndCalls() and some entry had that flag.
///   - Per entry, base pointer, pointer, runtime size and mapper are stored.
///   - For non-contiguous maps emitNonContiguousDescriptor() is invoked last.
///
/// \param AllocaIP        Where allocas are created.
/// \param CodeGenIP       Where the stores are emitted (reassigned locally
///                        when a device-pointer alloca is inserted).
/// \param CombinedInfo    Base pointers, pointers, sizes, types, names, etc.
/// \param Info            Receives the generated arrays and bookkeeping.
/// \param IsNonContiguous Enables the non-contiguous size/descriptor handling.
/// \param DeviceAddrCB    Optional; called with the index and the slot that
///                        will hold the device pointer/address.
/// \param CustomMapperCB  Optional; returns the mapper for an index, or null.
8015 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8016 TargetDataInfo &Info, bool IsNonContiguous,
8017 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8018 function_ref<Value *(unsigned int)> CustomMapperCB) {
8019
8020 // Reset the array information.
8021 Info.clearArrayInfo();
8022 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8023
8024 if (Info.NumberOfPtrs == 0)
8025 return;
8026
8027 Builder.restoreIP(AllocaIP);
8028 // Detect if we have any capture size requiring runtime evaluation of the
8029 // size so that a constant array could be eventually used.
8030 ArrayType *PointerArrayType =
8031 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8032
8033 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8034 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8035
8036 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8037 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8038 AllocaInst *MappersArray = Builder.CreateAlloca(
8039 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8040 Info.RTArgs.MappersArray = MappersArray;
8041
8042 // If we don't have any VLA types or other types that require runtime
8043 // evaluation, we can use a constant array for the map sizes, otherwise we
8044 // need to fill up the arrays as we do for the pointers.
8045 Type *Int64Ty = Builder.getInt64Ty();
8046 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8047 ConstantInt::get(Int64Ty, 0));
 // Bit I is set when entry I's size must be stored at run time.
8048 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8049 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8050 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
 // Constant expressions and globals are still treated as runtime sizes.
8051 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
 // For non-contiguous entries the "size" slot carries the dimension count.
8052 if (IsNonContiguous &&
8053 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8054 CombinedInfo.Types[I] &
8055 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8056 ConstSizes[I] =
8057 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8058 else
8059 ConstSizes[I] = CI;
8060 continue;
8061 }
8062 }
8063 RuntimeSizes.set(I);
8064 }
8065
8066 if (RuntimeSizes.all()) {
 // Every size is dynamic: a plain stack array, filled in the loop below.
8067 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8068 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8069 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8070 Builder.restoreIP(CodeGenIP);
8071 } else {
8072 auto *SizesArrayInit = ConstantArray::get(
8073 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8074 std::string Name = createPlatformSpecificName({"offload_sizes"});
8075 auto *SizesArrayGbl =
8076 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8077 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8078 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8079
8080 if (!RuntimeSizes.any()) {
 // All sizes constant: use the global directly.
8081 Info.RTArgs.SizesArray = SizesArrayGbl;
8082 } else {
 // Mixed: allocate a buffer and (on lines partly absent from this listing)
 // initialize it from the constant global; dynamic slots are overwritten
 // in the loop below.
8083 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8084 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8085 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8087 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8088 Buffer->setAlignment(OffloadSizeAlign);
8089 Builder.restoreIP(CodeGenIP);
8091 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8092 SizesArrayGbl, OffloadSizeAlign,
8094 IndexSize,
8095 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8096
8097 Info.RTArgs.SizesArray = Buffer;
8098 }
8099 Builder.restoreIP(CodeGenIP);
8100 }
8101
8102 // The map types are always constant so we don't need to generate code to
8103 // fill arrays. Instead, we create an array constant.
 // (The declaration of Mapping is on an absent line.)
8105 for (auto mapFlag : CombinedInfo.Types)
8106 Mapping.push_back(
8107 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8108 mapFlag));
8109 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8110 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8111 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8112
8113 // The information types are only built if provided.
8114 if (!CombinedInfo.Names.empty()) {
8115 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8116 auto *MapNamesArrayGbl =
8117 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8118 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8119 Info.EmitDebug = true;
8120 } else {
8121 Info.RTArgs.MapNamesArray =
8123 Info.EmitDebug = false;
8124 }
8125
8126 // If there's a present map type modifier, it must not be applied to the end
8127 // of a region, so generate a separate map type array in that case.
8128 if (Info.separateBeginEndCalls()) {
8129 bool EndMapTypesDiffer = false;
 // Mapping is edited in place; MapTypesArray was already created from the
 // unmodified values above.
8130 for (uint64_t &Type : Mapping) {
8131 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8132 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8133 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8134 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8135 EndMapTypesDiffer = true;
8136 }
8137 }
8138 if (EndMapTypesDiffer) {
8139 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8140 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8141 }
8142 }
8143
8144 PointerType *PtrTy = Builder.getPtrTy();
8145 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
 // Store the base pointer into slot I (the definition of BP is on an absent
 // line).
8146 Value *BPVal = CombinedInfo.BasePointers[I];
8148 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8149 0, I);
8150 Builder.CreateAlignedStore(BPVal, BP,
8152
8153 if (Info.requiresDevicePointerInfo()) {
8154 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
 // A fresh alloca is needed, so hop to AllocaIP and back. Note that this
 // reassigns the CodeGenIP parameter.
8155 CodeGenIP = Builder.saveIP();
8156 Builder.restoreIP(AllocaIP);
8157 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8158 Builder.restoreIP(CodeGenIP);
8159 if (DeviceAddrCB)
8160 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8161 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8162 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8163 if (DeviceAddrCB)
8164 DeviceAddrCB(I, BP);
8165 }
8166 }
8167
8168 Value *PVal = CombinedInfo.Pointers[I];
8170 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8171 I);
8172 // TODO: Check alignment correct.
8175
8176 if (RuntimeSizes.test(I)) {
8178 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8179 /*Idx0=*/0,
8180 /*Idx1=*/I);
8182 Int64Ty,
8183 /*isSigned=*/true),
 // NOTE(review): the alignment below is derived from PtrTy although the
 // stored value is cast to Int64Ty -- confirm this is intended.
8184 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8185 }
8186 // Fill up the mapper array.
8187 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
 // Default to a null mapper unless the callback supplies one.
8188 Value *MFunc = ConstantPointerNull::get(PtrTy);
8189 if (CustomMapperCB)
8190 if (Value *CustomMFunc = CustomMapperCB(I))
8191 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8193 MappersArray->getAllocatedType(), MappersArray,
8194 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8196 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8197 }
8198
 // NOTE(review): NumberOfPtrs == 0 already returned early above and is not
 // reassigned in the visible lines, so that part of the test looks redundant.
8199 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8200 Info.NumberOfPtrs == 0)
8201 return;
8202 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8203}
8204
/// Terminate the current block with a fall-through branch when needed.
///
/// NOTE(review): the header of this definition and the lines numbered 8213 and
/// 8216 are absent from this listing; the name (presumably
/// OpenMPIRBuilder::emitBranch, as called from emitBlock) and the exact
/// statements on those lines cannot be confirmed from here.
///
/// Visible logic: when there is no current block, or it already ends in a
/// terminator, the block is left untouched; otherwise a branch is created.
8207
8208 if (!CurBB || CurBB->getTerminator()) {
8209 // If there is no insert point or the previous block is already
8210 // terminated, don't touch it.
8211 } else {
8212 // Otherwise, create a fall-through branch.
8214 }
8215
8217}
8218
/// Append block BB to function CurFn and branch into it from the current
/// block if that block is still open.
///
/// NOTE(review): the first header line and the lines numbered 8221 and 8237
/// are absent from this listing (presumably they name the function, define
/// CurBB from the builder, and move the insert point to BB) -- confirm.
///
/// When \p IsFinished is true and nothing references BB after the
/// fall-through branch was (or was not) emitted, BB is discarded instead of
/// being inserted. Otherwise BB is placed right after the current block when
/// that block belongs to a function, or at the end of CurFn.
8220 bool IsFinished) {
8222
8223 // Fall out of the current block (if necessary).
8224 emitBranch(BB);
8225
 // NOTE(review): callers in this file create BB without a parent function;
 // confirm eraseFromParent() is valid for a block that was never inserted.
8226 if (IsFinished && BB->use_empty()) {
8227 BB->eraseFromParent();
8228 return;
8229 }
8230
8231 // Place the block after the current block, if possible, or else at
8232 // the end of the function.
8233 if (CurBB && CurBB->getParent())
8234 CurFn->insert(std::next(CurBB->getIterator()), BB);
8235 else
8236 CurFn->insert(CurFn->end(), BB);
8238}
8239
/// Emit an if/else diamond around two body generators.
///
/// NOTE(review): the first header line (return type, name, leading
/// parameters such as Cond and ThenGen) and the line numbered 8253
/// (presumably the definition of CurFn) are absent from this listing.
///
/// If Cond is a ConstantInt, only the selected generator is run at the
/// current insert point and its result is returned; no blocks are created.
/// Otherwise "omp_if.then", "omp_if.else" and "omp_if.end" blocks are created,
/// ThenGen and ElseGen are each invoked with \p AllocaIP and the insert point
/// inside their block, and both arms branch to the continuation block. The
/// first error reported by a generator is returned immediately.
///
/// \returns Error::success() on the non-constant path when both generators
///          succeed; otherwise whatever the failing/selected generator returned.
8241 BodyGenCallbackTy ElseGen,
8242 InsertPointTy AllocaIP) {
8243 // If the condition constant folds and can be elided, try to avoid emitting
8244 // the condition and the dead arm of the if/else.
8245 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
 // Any non-zero constant selects the 'then' generator.
8246 auto CondConstant = CI->getSExtValue();
8247 if (CondConstant)
8248 return ThenGen(AllocaIP, Builder.saveIP());
8249
8250 return ElseGen(AllocaIP, Builder.saveIP());
8251 }
8252
8254
8255 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8256 // emit the conditional branch.
8257 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8258 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8259 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8260 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8261 // Emit the 'then' code.
8262 emitBlock(ThenBlock, CurFn);
8263 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8264 return Err;
8265 emitBranch(ContBlock);
8266 // Emit the 'else' code if present.
8267 // There is no need to emit line number for unconditional branch.
8268 emitBlock(ElseBlock, CurFn);
8269 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8270 return Err;
8271 // There is no need to emit line number for unconditional branch.
8272 emitBranch(ContBlock);
8273 // Emit the continuation block for code after the if.
8274 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8275 return Error::success();
8276}
8277
/// Decide whether a flush has to follow an atomic operation and emit it.
///
/// For the atomic kind \p AK a flush ordering (FlushAO) is selected and the
/// Flush flag is set; when the flag is set, emitFlush(\p Loc) is called.
/// FlushAO itself is currently unused beyond being computed (see the TODO
/// below).
///
/// NOTE(review): the lines that test \p AO (numbered 8280-8281, 8285,
/// 8289-8290, 8298-8299, 8306, 8310 and 8314-8316) are absent from this
/// listing, so the exact ordering conditions guarding each assignment cannot
/// be confirmed from here.
///
/// \returns true if a flush call was emitted, false otherwise.
8278bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8279 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8282 "Unexpected Atomic Ordering.");
8283
8284 bool Flush = false;
8286
8287 switch (AK) {
8288 case Read:
 // Reads select an acquire flush (guard on an absent line).
8291 FlushAO = AtomicOrdering::Acquire;
8292 Flush = true;
8293 }
8294 break;
8295 case Write:
8296 case Compare:
8297 case Update:
 // Writes, compares and updates select a release flush (guard on an absent
 // line).
8300 FlushAO = AtomicOrdering::Release;
8301 Flush = true;
8302 }
8303 break;
8304 case Capture:
 // Captures pick the flush ordering from AO (case labels on absent lines).
8305 switch (AO) {
8307 FlushAO = AtomicOrdering::Acquire;
8308 Flush = true;
8309 break;
8311 FlushAO = AtomicOrdering::Release;
8312 Flush = true;
8313 break;
8317 Flush = true;
8318 break;
8319 default:
8320 // do nothing - leave silently.
8321 break;
8322 }
8323 }
8324
8325 if (Flush) {
8326 // Currently Flush RT call still doesn't take memory_ordering, so for when
8327 // that happens, this tries to do the resolution of which atomic ordering
8328 // to use with but issue the flush call
8329 // TODO: pass `FlushAO` after memory ordering support is added
8330 (void)FlushAO;
8331 emitFlush(Loc);
8332 }
8333
8334 // for AO == AtomicOrdering::Monotonic and all other case combinations
8335 // do nothing
8336 return Flush;
8337}
8338
/// Emit an atomic read of X and store the value read into V.
///
/// NOTE(review): the header lines (return type, name and the leading
/// parameters Loc, X, V) and the line numbered 8377 are absent from this
/// listing; the name is presumably OpenMPIRBuilder::createAtomicRead.
///
/// Three paths, chosen by X.ElemTy:
///   - integer: a single atomic load of that type;
///   - struct:  an atomic-load libcall via AtomicInfo; the temporary LoadInst
///              created first is erased again afterwards;
///   - float/pointer: an atomic load as an integer (IntCastTy), then a
///              bitcast (float) or inttoptr (pointer) back to X.ElemTy.
/// Afterwards a flush is emitted if required for \p AO, and the value is
/// stored to V.Var honoring V.IsVolatile.
///
/// \returns the builder's insert point after the emitted code, or Loc.IP
///          unchanged when updateToLocation(Loc) fails.
8342 AtomicOrdering AO) {
8343 if (!updateToLocation(Loc))
8344 return Loc.IP;
8345
8346 assert(X.Var->getType()->isPointerTy() &&
8347 "OMP Atomic expects a pointer to target memory");
8348 Type *XElemTy = X.ElemTy;
8349 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8350 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8351 "OMP atomic read expected a scalar type");
8352
8353 Value *XRead = nullptr;
8354
8355 if (XElemTy->isIntegerTy()) {
8356 LoadInst *XLD =
8357 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8358 XLD->setAtomic(AO);
8359 XRead = cast<Value>(XLD);
8360 } else if (XElemTy->isStructTy()) {
8361 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8362 // target does not support `atomicrmw` of the size of the struct
 // OldVal only serves as a source for the data layout and alignment; it is
 // erased below. Unlike the other paths it does not receive X.IsVolatile.
8363 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8364 OldVal->setAtomic(AO);
8365 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
 // NOTE(review): the size is taken from the type of the pointer operand,
 // not from XElemTy -- confirm this is the intended atomic size.
8366 unsigned LoadSize =
8367 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8368 OpenMPIRBuilder::AtomicInfo atomicInfo(
8369 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8370 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8371 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8372 XRead = AtomicLoadRes.first;
8373 OldVal->eraseFromParent();
8374 } else {
8375 // We need to perform atomic op as integer
 // (The initializer of IntCastTy is on an absent line.)
8376 IntegerType *IntCastTy =
8378 LoadInst *XLoad =
8379 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8380 XLoad->setAtomic(AO);
8381 if (XElemTy->isFloatingPointTy()) {
8382 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8383 } else {
8384 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8385 }
8386 }
8387 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8388 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8389 return Builder.saveIP();
8390}
8391
/// Emit an atomic store of Expr to X.
///
/// NOTE(review): the first header lines (return type, name, Loc parameter)
/// and the line numbered 8412 are absent from this listing; the name is
/// presumably OpenMPIRBuilder::createAtomicWrite.
///
/// Integer element types are stored directly; every other accepted type is
/// first cast to an integer type (IntCastTy) and then stored. The store is
/// made atomic with ordering \p AO and honors X.IsVolatile. A flush is
/// emitted afterwards if required for \p AO.
///
/// \returns the builder's insert point after the emitted code, or Loc.IP
///          unchanged when updateToLocation(Loc) fails.
8394 AtomicOpValue &X, Value *Expr,
8395 AtomicOrdering AO) {
8396 if (!updateToLocation(Loc))
8397 return Loc.IP;
8398
8399 assert(X.Var->getType()->isPointerTy() &&
8400 "OMP Atomic expects a pointer to target memory");
8401 Type *XElemTy = X.ElemTy;
8402 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8403 XElemTy->isPointerTy()) &&
8404 "OMP atomic write expected a scalar type");
8405
8406 if (XElemTy->isIntegerTy()) {
8407 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8408 XSt->setAtomic(AO);
8409 } else {
8410 // We need to bitcast and perform atomic op as integers
 // (The initializer of IntCastTy is on an absent line.)
8411 IntegerType *IntCastTy =
 // NOTE(review): the assert above admits pointer element types, but this
 // path uses CreateBitCast for every non-integer type -- confirm pointer
 // operands are converted correctly (the read path uses inttoptr instead).
8413 Value *ExprCast =
8414 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8415 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8416 XSt->setAtomic(AO);
8417 }
8418
8419 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8420 return Builder.saveIP();
8421}
8422
/// Emit an atomic update of X using \p RMWOp / \p UpdateOp.
///
/// NOTE(review): the header line with the return type and name (presumably
/// OpenMPIRBuilder::createAtomicUpdate) and the line numbered 8444 (the
/// declaration of AtomicResult) are absent from this listing.
///
/// The work is delegated to emitAtomicUpdate(); if that reports an error it
/// is forwarded to the caller. On success a flush is emitted if required for
/// \p AO.
///
/// \param AllocaIP     Insert point for temporaries; must not be ambiguous
///                     with Loc.IP (asserted).
/// \param IsXBinopExpr Forwarded to emitAtomicUpdate().
/// \returns the builder's insert point after the emitted code, Loc.IP
///          unchanged when updateToLocation(Loc) fails, or the error from
///          emitAtomicUpdate().
8424 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8425 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8426 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8427 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8428 if (!updateToLocation(Loc))
8429 return Loc.IP;
8430
 // These sanity checks are wrapped in LLVM_DEBUG, so they are not plain
 // unconditional asserts.
8431 LLVM_DEBUG({
8432 Type *XTy = X.Var->getType();
8433 assert(XTy->isPointerTy() &&
8434 "OMP Atomic expects a pointer to target memory");
8435 Type *XElemTy = X.ElemTy;
8436 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8437 XElemTy->isPointerTy()) &&
8438 "OMP atomic update expected a scalar type");
8439 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8440 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8441 "OpenMP atomic does not support LT or GT operations");
8442 });
8443
8445 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8446 X.IsVolatile, IsXBinopExpr);
8447 if (!AtomicResult)
8448 return AtomicResult.takeError();
8449 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8450 return Builder.saveIP();
8451}
8452
8453// FIXME: Duplicating AtomicExpand
/// Emit the ordinary (non-atomic) instruction that computes the same result
/// as an `atomicrmw` with operation \p RMWOp would store.
///
/// \param Src1  Left operand (the value previously held in memory).
/// \param Src2  Right operand (the update expression).
/// \param RMWOp Operation to mirror. The arms visible here handle add, sub,
///              and, or, xor and the and-then-invert (nand) form; the
///              remaining listed operations are treated as unreachable.
/// \returns the value created by the builder.
///
/// NOTE(review): this text comes from a numbered listing with gaps; the case
/// labels on the lines numbered 8463, 8469-8472 and 8475-8482 are absent. The
/// label in front of the and-then-invert arm is presumably
/// AtomicRMWInst::Nand.
8454Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8455 AtomicRMWInst::BinOp RMWOp) {
8456 switch (RMWOp) {
8457 case AtomicRMWInst::Add:
8458 return Builder.CreateAdd(Src1, Src2);
8459 case AtomicRMWInst::Sub:
8460 return Builder.CreateSub(Src1, Src2);
8461 case AtomicRMWInst::And:
8462 return Builder.CreateAnd(Src1, Src2);
 // nand is the bitwise complement ~(a & b). CreateNeg would compute the
 // arithmetic negation 0 - (a & b), which is a different value, so the
 // complement has to be built with CreateNot.
8464 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
8465 case AtomicRMWInst::Or:
8466 return Builder.CreateOr(Src1, Src2);
8467 case AtomicRMWInst::Xor:
8468 return Builder.CreateXor(Src1, Src2);
8473 case AtomicRMWInst::Max:
8474 case AtomicRMWInst::Min:
8483 llvm_unreachable("Unsupported atomic update operation");
8484 }
8485 llvm_unreachable("Unsupported atomic update operation");
8486}
8487
/// Emit the IR for an atomic read-modify-write of the memory at \p X.
///
/// Three strategies are used:
///   1. A single `atomicrmw` when \p RMWOp is one of the directly supported
///      operations and \p XElemTy is an integer type. The updated value is
///      then recomputed non-atomically for the caller (or, for Xchg, set to
///      the same value as the old one).
///   2. For struct element types with BAD_BINOP: a libcall based atomic load
///      followed by a compare-exchange libcall retry loop.
///   3. Otherwise: an integer atomic load followed by a cmpxchg retry loop,
///      casting to/from \p XElemTy for floating point and pointer types.
/// In the loop strategies the current block is split into
/// CurBB -> ContBB (loop) -> ExitBB, \p UpdateOp computes the new value from
/// the old one, and the new value is staged in an alloca created at
/// \p AllocaIP. On return the builder is positioned in the exit block.
///
/// \param AllocaIP     Where the staging alloca is created.
/// \param X            Pointer to the memory being updated.
/// \param XElemTy      Type of the value stored at \p X.
/// \param Expr         Right-hand operand for the `atomicrmw` strategy.
/// \param UpdateOp     Callback producing the new value from the old one; an
///                     error it returns is propagated.
/// \param VolatileX    Applied to the cmpxchg in strategy 3.
/// \param IsXBinopExpr Required for Sub to use the `atomicrmw` strategy.
/// \returns {value before the update, value after the update}, or the error
///          returned by \p UpdateOp.
///
/// NOTE(review): this text comes from a numbered listing with gaps (e.g. the
/// lines numbered 8490, 8498, 8501, 8535, 8555-8556, 8575, 8584 and
/// 8616-8618 are absent), so a few declarations used below (AO, RMWOp, CurBB,
/// Failure, IntCastTy's initializer, Result in strategy 3) are not visible.
8488Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8489 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8491 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8492 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8493 // or a complex datatype.
8494 bool emitRMWOp = false;
8495 switch (RMWOp) {
8496 case AtomicRMWInst::Add:
8497 case AtomicRMWInst::And:
8499 case AtomicRMWInst::Or:
8500 case AtomicRMWInst::Xor:
8502 emitRMWOp = XElemTy;
8503 break;
8504 case AtomicRMWInst::Sub:
8505 emitRMWOp = (IsXBinopExpr && XElemTy);
8506 break;
8507 default:
8508 emitRMWOp = false;
8509 }
 // A direct atomicrmw is only used for integer element types.
8510 emitRMWOp &= XElemTy->isIntegerTy();
8511
8512 std::pair<Value *, Value *> Res;
8513 if (emitRMWOp) {
8514 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8515 // not needed except in case of postfix captures. Generate anyway for
8516 // consistency with the else part. Will be removed with any DCE pass.
8517 // AtomicRMWInst::Xchg does not have a coressponding instruction.
8518 if (RMWOp == AtomicRMWInst::Xchg)
8519 Res.second = Res.first;
8520 else
8521 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8522 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8523 XElemTy->isStructTy()) {
 // OldVal is a temporary that only supplies data layout, type and
 // alignment; it is erased once the loop has been built.
8524 LoadInst *OldVal =
8525 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8526 OldVal->setAtomic(AO);
8527 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
 // NOTE(review): the size is taken from the type of the pointer operand,
 // not from XElemTy -- confirm this is the intended atomic size.
8528 unsigned LoadSize =
8529 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8530
8531 OpenMPIRBuilder::AtomicInfo atomicInfo(
8532 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8533 OldVal->getAlign(), true /* UseLibcall */, X);
8534 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
 // Make sure there is a terminator to split at; a placeholder unreachable is
 // created when the block is still open.
8536 Instruction *CurBBTI = CurBB->getTerminator();
8537 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8538 BasicBlock *ExitBB =
8539 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8540 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8541 X->getName() + ".atomic.cont");
8542 ContBB->getTerminator()->eraseFromParent();
8543 Builder.restoreIP(AllocaIP);
8544 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8545 NewAtomicAddr->setName(X->getName() + "x.new.val");
8546 Builder.SetInsertPoint(ContBB);
8547 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8548 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8549 Value *OldExprVal = PHI;
8550 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8551 if (!CBResult)
8552 return CBResult.takeError();
8553 Value *Upd = *CBResult;
8554 Builder.CreateStore(Upd, NewAtomicAddr);
8557 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8558 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8559 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8560 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8561 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8562 OldVal->eraseFromParent();
8563 Res.first = OldExprVal;
8564 Res.second = Upd;
8565
8566 if (UnreachableInst *ExitTI =
8567 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8568 CurBBTI->eraseFromParent();
8569 Builder.SetInsertPoint(ExitBB);
8570 } else {
 // ExitTI is null in this arm (the dyn_cast failed), so it must not be
 // handed to SetInsertPoint; insert before the exit block's real terminator.
8571 Builder.SetInsertPoint(ExitBB->getTerminator());
8572 }
8573 } else {
8574 IntegerType *IntCastTy =
8576 LoadInst *OldVal =
8577 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8578 OldVal->setAtomic(AO);
8579 // CurBB
8580 // | /---\
8581 // ContBB |
8582 // | \---/
8583 // ExitBB
8585 Instruction *CurBBTI = CurBB->getTerminator();
8586 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8587 BasicBlock *ExitBB =
8588 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8589 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8590 X->getName() + ".atomic.cont");
8591 ContBB->getTerminator()->eraseFromParent();
8592 Builder.restoreIP(AllocaIP);
8593 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8594 NewAtomicAddr->setName(X->getName() + "x.new.val");
8595 Builder.SetInsertPoint(ContBB);
8596 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8597 PHI->addIncoming(OldVal, CurBB);
8598 bool IsIntTy = XElemTy->isIntegerTy();
8599 Value *OldExprVal = PHI;
 // The loop carries the value as an integer; convert it back to the element
 // type before handing it to the update callback.
8600 if (!IsIntTy) {
8601 if (XElemTy->isFloatingPointTy()) {
8602 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8603 X->getName() + ".atomic.fltCast");
8604 } else {
8605 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8606 X->getName() + ".atomic.ptrCast");
8607 }
8608 }
8609
8610 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8611 if (!CBResult)
8612 return CBResult.takeError();
8613 Value *Upd = *CBResult;
 // Round-trip through the alloca to reinterpret the new value as an integer.
8614 Builder.CreateStore(Upd, NewAtomicAddr);
8615 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8619 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8620 Result->setVolatile(VolatileX);
8621 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8622 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8623 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8624 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8625
8626 Res.first = OldExprVal;
8627 Res.second = Upd;
8628
8629 // set Insertion point in exit block
8630 if (UnreachableInst *ExitTI =
8631 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8632 CurBBTI->eraseFromParent();
8633 Builder.SetInsertPoint(ExitBB);
8634 } else {
 // ExitTI is null in this arm (the dyn_cast failed), so it must not be
 // handed to SetInsertPoint; insert before the exit block's real terminator.
8635 Builder.SetInsertPoint(ExitBB->getTerminator());
8636 }
8637 }
8638
8639 return Res;
8640}
8641
8643 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8644 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8646 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8647 if (!updateToLocation(Loc))
8648 return Loc.IP;
8649
8650 LLVM_DEBUG({
8651 Type *XTy = X.Var->getType();
8652 assert(XTy->isPointerTy() &&
8653 "OMP Atomic expects a pointer to target memory");
8654 Type *XElemTy = X.ElemTy;
8655 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8656 XElemTy->isPointerTy()) &&
8657 "OMP atomic capture expected a scalar type");
8658 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8659 "OpenMP atomic does not support LT or GT operations");
8660 });
8661
8662 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8663 // 'x' is simply atomically rewritten with 'expr'.
8664 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8666 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8667 X.IsVolatile, IsXBinopExpr);
8668 if (!AtomicResult)
8669 return AtomicResult.takeError();
8670 Value *CapturedVal =
8671 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8672 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8673
8674 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8675 return Builder.saveIP();
8676}
8677
8681 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8682 bool IsFailOnly) {
8683
8685 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8686 IsPostfixUpdate, IsFailOnly, Failure);
8687}
8688
8692 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8693 bool IsFailOnly, AtomicOrdering Failure) {
8694
8695 if (!updateToLocation(Loc))
8696 return Loc.IP;
8697
8698 assert(X.Var->getType()->isPointerTy() &&
8699 "OMP atomic expects a pointer to target memory");
8700 // compare capture
8701 if (V.Var) {
8702 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8703 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8704 }
8705
8706 bool IsInteger = E->getType()->isIntegerTy();
8707
8708 if (Op == OMPAtomicCompareOp::EQ) {
8709 AtomicCmpXchgInst *Result = nullptr;
8710 if (!IsInteger) {
8711 IntegerType *IntCastTy =
8712 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8713 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8714 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8715 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8716 AO, Failure);
8717 } else {
8718 Result =
8719 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8720 }
8721
8722 if (V.Var) {
8723 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8724 if (!IsInteger)
8725 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8726 assert(OldValue->getType() == V.ElemTy &&
8727 "OldValue and V must be of same type");
8728 if (IsPostfixUpdate) {
8729 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8730 } else {
8731 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8732 if (IsFailOnly) {
8733 // CurBB----
8734 // | |
8735 // v |
8736 // ContBB |
8737 // | |
8738 // v |
8739 // ExitBB <-
8740 //
8741 // where ContBB only contains the store of old value to 'v'.
8743 Instruction *CurBBTI = CurBB->getTerminator();
8744 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8745 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8746 CurBBTI, X.Var->getName() + ".atomic.exit");
8747 BasicBlock *ContBB = CurBB->splitBasicBlock(
8748 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8749 ContBB->getTerminator()->eraseFromParent();
8750 CurBB->getTerminator()->eraseFromParent();
8751
8752 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8753
8754 Builder.SetInsertPoint(ContBB);
8755 Builder.CreateStore(OldValue, V.Var);
8756 Builder.CreateBr(ExitBB);
8757
8758 if (UnreachableInst *ExitTI =
8759 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8760 CurBBTI->eraseFromParent();
8761 Builder.SetInsertPoint(ExitBB);
8762 } else {
8763 Builder.SetInsertPoint(ExitTI);
8764 }
8765 } else {
8766 Value *CapturedValue =
8767 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8768 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8769 }
8770 }
8771 }
8772 // The comparison result has to be stored.
8773 if (R.Var) {
8774 assert(R.Var->getType()->isPointerTy() &&
8775 "r.var must be of pointer type");
8776 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8777
8778 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8779 Value *ResultCast = R.IsSigned
8780 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8781 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8782 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8783 }
8784 } else {
8785 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8786 "Op should be either max or min at this point");
8787 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8788
8789 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8790 // Let's take max as example.
8791 // OpenMP form:
8792 // x = x > expr ? expr : x;
8793 // LLVM form:
8794 // *ptr = *ptr > val ? *ptr : val;
8795 // We need to transform to LLVM form.
8796 // x = x <= expr ? x : expr;
8798 if (IsXBinopExpr) {
8799 if (IsInteger) {
8800 if (X.IsSigned)
8801 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8803 else
8804 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8806 } else {
8807 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8809 }
8810 } else {
8811 if (IsInteger) {
8812 if (X.IsSigned)
8813 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8815 else
8816 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8818 } else {
8819 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8821 }
8822 }
8823
8824 AtomicRMWInst *OldValue =
8825 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8826 if (V.Var) {
8827 Value *CapturedValue = nullptr;
8828 if (IsPostfixUpdate) {
8829 CapturedValue = OldValue;
8830 } else {
8831 CmpInst::Predicate Pred;
8832 switch (NewOp) {
8833 case AtomicRMWInst::Max:
8834 Pred = CmpInst::ICMP_SGT;
8835 break;
8837 Pred = CmpInst::ICMP_UGT;
8838 break;
8840 Pred = CmpInst::FCMP_OGT;
8841 break;
8842 case AtomicRMWInst::Min:
8843 Pred = CmpInst::ICMP_SLT;
8844 break;
8846 Pred = CmpInst::ICMP_ULT;
8847 break;
8849 Pred = CmpInst::FCMP_OLT;
8850 break;
8851 default:
8852 llvm_unreachable("unexpected comparison op");
8853 }
8854 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8855 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8856 }
8857 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8858 }
8859 }
8860
8861 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8862
8863 return Builder.saveIP();
8864}
8865
8868 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8869 Value *NumTeamsUpper, Value *ThreadLimit,
8870 Value *IfExpr) {
8871 if (!updateToLocation(Loc))
8872 return InsertPointTy();
8873
8874 uint32_t SrcLocStrSize;
8875 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8876 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8877 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8878
8879 // Outer allocation basicblock is the entry block of the current function.
8880 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8881 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8882 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8883 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8884 }
8885
8886 // The current basic block is split into four basic blocks. After outlining,
8887 // they will be mapped as follows:
8888 // ```
8889 // def current_fn() {
8890 // current_basic_block:
8891 // br label %teams.exit
8892 // teams.exit:
8893 // ; instructions after teams
8894 // }
8895 //
8896 // def outlined_fn() {
8897 // teams.alloca:
8898 // br label %teams.body
8899 // teams.body:
8900 // ; instructions within teams body
8901 // }
8902 // ```
8903 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8904 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8905 BasicBlock *AllocaBB =
8906 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8907
8908 bool SubClausesPresent =
8909 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8910 // Push num_teams
8911 if (!Config.isTargetDevice() && SubClausesPresent) {
8912 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8913 "if lowerbound is non-null, then upperbound must also be non-null "
8914 "for bounds on num_teams");
8915
8916 if (NumTeamsUpper == nullptr)
8917 NumTeamsUpper = Builder.getInt32(0);
8918
8919 if (NumTeamsLower == nullptr)
8920 NumTeamsLower = NumTeamsUpper;
8921
8922 if (IfExpr) {
8923 assert(IfExpr->getType()->isIntegerTy() &&
8924 "argument to if clause must be an integer value");
8925
8926 // upper = ifexpr ? upper : 1
8927 if (IfExpr->getType() != Int1)
8928 IfExpr = Builder.CreateICmpNE(IfExpr,
8929 ConstantInt::get(IfExpr->getType(), 0));
8930 NumTeamsUpper = Builder.CreateSelect(
8931 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8932
8933 // lower = ifexpr ? lower : 1
8934 NumTeamsLower = Builder.CreateSelect(
8935 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8936 }
8937
8938 if (ThreadLimit == nullptr)
8939 ThreadLimit = Builder.getInt32(0);
8940
8941 Value *ThreadNum = getOrCreateThreadID(Ident);
8943 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8944 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8945 }
8946 // Generate the body of teams.
8947 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8948 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8949 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
8950 return Err;
8951
8952 OutlineInfo OI;
8953 OI.EntryBB = AllocaBB;
8954 OI.ExitBB = ExitBB;
8955 OI.OuterAllocaBB = &OuterAllocaBB;
8956
8957 // Insert fake values for global tid and bound tid.
8959 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8961 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8963 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8964
8965 auto HostPostOutlineCB = [this, Ident,
8966 ToBeDeleted](Function &OutlinedFn) mutable {
8967 // The stale call instruction will be replaced with a new call instruction
8968 // for runtime call with the outlined function.
8969
8970 assert(OutlinedFn.getNumUses() == 1 &&
8971 "there must be a single user for the outlined function");
8972 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8973 ToBeDeleted.push_back(StaleCI);
8974
8975 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8976 "Outlined function must have two or three arguments only");
8977
8978 bool HasShared = OutlinedFn.arg_size() == 3;
8979
8980 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8981 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8982 if (HasShared)
8983 OutlinedFn.getArg(2)->setName("data");
8984
8985 // Call to the runtime function for teams in the current function.
8986 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8987 "outlined function.");
8988 Builder.SetInsertPoint(StaleCI);
8989 SmallVector<Value *> Args = {
8990 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8991 if (HasShared)
8992 Args.push_back(StaleCI->getArgOperand(2));
8994 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8995 Args);
8996
8997 for (Instruction *I : llvm::reverse(ToBeDeleted))
8998 I->eraseFromParent();
8999 };
9000
9001 if (!Config.isTargetDevice())
9002 OI.PostOutlineCB = HostPostOutlineCB;
9003
9004 addOutlineInfo(std::move(OI));
9005
9006 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9007
9008 return Builder.saveIP();
9009}
9010
9013 std::string VarName) {
9014 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9016 Names.size()),
9017 Names);
9018 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9019 M, MapNamesArrayInit->getType(),
9020 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9021 VarName);
9022 return MapNamesArrayGlobal;
9023}
9024
9025// Create all simple and struct types exposed by the runtime and remember
9026// the llvm::PointerTypes of them for easy access later.
//
// The function body consists of four helper macros followed by an include of
// OMPKinds.def; presumably that file invokes these macros once per runtime
// type it lists (its contents are not visible here - confirm against the
// .def file).
9027void OpenMPIRBuilder::initializeTypes(Module &M) {
// Context of the module; referenced by name inside OMP_STRUCT_TYPE below.
9028 LLVMContext &Ctx = M.getContext();
// Scratch variable assigned and read by each OMP_STRUCT_TYPE expansion.
9029 StructType *T;
// OMP_TYPE: plain assignment of InitValue to the variable VarName.
9030#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
// OMP_ARRAY_TYPE: sets <VarName>Ty to an array type of ArraySize ElemTy
// elements and <VarName>PtrTy to an unqualified pointer type derived from it.
9031#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9032 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9033 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
// OMP_FUNCTION_TYPE: sets VarName to a function type built from ReturnType,
// the variadic parameter-type list and IsVarArg, and <VarName>Ptr to an
// unqualified pointer type derived from it.
9034#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9035 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9036 VarName##Ptr = PointerType::getUnqual(VarName);
// OMP_STRUCT_TYPE: looks the struct up by StructName in Ctx and only creates
// it (from the variadic element types, with the Packed flag) when the lookup
// returns null, so an already existing named struct is reused. The result is
// stored in VarName, and <VarName>Ptr gets the matching unqualified pointer
// type.
9037#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9038 T = StructType::getTypeByName(Ctx, StructName); \
9039 if (!T) \
9040 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9041 VarName = T; \
9042 VarName##Ptr = PointerType::getUnqual(T);
// Pull in the type list; must come after the macro definitions above.
9043#include "llvm/Frontend/OpenMP/OMPKinds.def"
9044}
9045
9048 SmallVectorImpl<BasicBlock *> &BlockVector) {
9050 BlockSet.insert(EntryBB);
9051 BlockSet.insert(ExitBB);
9052
9053 Worklist.push_back(EntryBB);
9054 while (!Worklist.empty()) {
9055 BasicBlock *BB = Worklist.pop_back_val();
9056 BlockVector.push_back(BB);
9057 for (BasicBlock *SuccBB : successors(BB))
9058 if (BlockSet.insert(SuccBB).second)
9059 Worklist.push_back(SuccBB);
9060 }
9061}
9062
9064 uint64_t Size, int32_t Flags,
9066 StringRef Name) {
9067 if (!Config.isGPU()) {
9069 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9070 "omp_offloading_entries");
9071 return;
9072 }
9073 // TODO: Add support for global variables on the device after declare target
9074 // support.
9075 Function *Fn = dyn_cast<Function>(Addr);
9076 if (!Fn)
9077 return;
9078
9079 Module &M = *(Fn->getParent());
9080 LLVMContext &Ctx = M.getContext();
9081
9082 // Get "nvvm.annotations" metadata node.
9083 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9084
9085 Metadata *MDVals[] = {
9086 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9087 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9088 // Append metadata to nvvm.annotations.
9089 MD->addOperand(MDNode::get(Ctx, MDVals));
9090
9091 // Add a function attribute for the kernel.
9092 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9093 if (T.isAMDGCN())
9094 Fn->addFnAttr("uniform-work-group-size", "true");
9095 Fn->addFnAttr(Attribute::MustProgress);
9096}
9097
9098// We only generate metadata for function that contain target regions.
9101
9102 // If there are no entries, we don't need to do anything.
9104 return;
9105
9109 16>
9110 OrderedEntries(OffloadInfoManager.size());
9111
9112 // Auxiliary methods to create metadata values and strings.
9113 auto &&GetMDInt = [this](unsigned V) {
9114 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9115 };
9116
9117 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9118
9119 // Create the offloading info metadata node.
9120 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9121 auto &&TargetRegionMetadataEmitter =
9122 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9123 const TargetRegionEntryInfo &EntryInfo,
9125 // Generate metadata for target regions. Each entry of this metadata
9126 // contains:
9127 // - Entry 0 -> Kind of this type of metadata (0).
9128 // - Entry 1 -> Device ID of the file where the entry was identified.
9129 // - Entry 2 -> File ID of the file where the entry was identified.
9130 // - Entry 3 -> Mangled name of the function where the entry was
9131 // identified.
9132 // - Entry 4 -> Line in the file where the entry was identified.
9133 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9134 // - Entry 6 -> Order the entry was created.
9135 // The first element of the metadata node is the kind.
9136 Metadata *Ops[] = {
9137 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9138 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9139 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9140 GetMDInt(E.getOrder())};
9141
9142 // Save this entry in the right position of the ordered entries array.
9143 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9144
9145 // Add metadata to the named metadata node.
9146 MD->addOperand(MDNode::get(C, Ops));
9147 };
9148
9149 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9150
9151 // Create function that emits metadata for each device global variable entry;
9152 auto &&DeviceGlobalVarMetadataEmitter =
9153 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9154 StringRef MangledName,
9156 // Generate metadata for global variables. Each entry of this metadata
9157 // contains:
9158 // - Entry 0 -> Kind of this type of metadata (1).
9159 // - Entry 1 -> Mangled name of the variable.
9160 // - Entry 2 -> Declare target kind.
9161 // - Entry 3 -> Order the entry was created.
9162 // The first element of the metadata node is the kind.
9163 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9164 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9165
9166 // Save this entry in the right position of the ordered entries array.
9167 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9168 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9169
9170 // Add metadata to the named metadata node.
9171 MD->addOperand(MDNode::get(C, Ops));
9172 };
9173
9175 DeviceGlobalVarMetadataEmitter);
9176
9177 for (const auto &E : OrderedEntries) {
9178 assert(E.first && "All ordered entries must exist!");
9179 if (const auto *CE =
9180 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9181 E.first)) {
9182 if (!CE->getID() || !CE->getAddress()) {
9183 // Do not blame the entry if the parent function is not emitted.
9184 TargetRegionEntryInfo EntryInfo = E.second;
9185 StringRef FnName = EntryInfo.ParentName;
9186 if (!M.getNamedValue(FnName))
9187 continue;
9188 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9189 continue;
9190 }
9191 createOffloadEntry(CE->getID(), CE->getAddress(),
9192 /*Size=*/0, CE->getFlags(),
9194 } else if (const auto *CE = dyn_cast<
9196 E.first)) {
9199 CE->getFlags());
9200 switch (Flags) {
9204 continue;
9205 if (!CE->getAddress()) {
9206 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9207 continue;
9208 }
9209 // The variable has no definition - no need to add the entry.
9210 if (CE->getVarSize() == 0)
9211 continue;
9212 break;
9214 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9215 (!Config.isTargetDevice() && CE->getAddress())) &&
9216 "Declaret target link address is set.");
9217 if (Config.isTargetDevice())
9218 continue;
9219 if (!CE->getAddress()) {
9221 continue;
9222 }
9223 break;
9224 default:
9225 break;
9226 }
9227
9228 // Hidden or internal symbols on the device are not externally visible.
9229 // We should not attempt to register them by creating an offloading
9230 // entry. Indirect variables are handled separately on the device.
9231 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9232 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9234 continue;
9235
9236 // Indirect globals need to use a special name that doesn't match the name
9237 // of the associated host global.
9239 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9240 Flags, CE->getLinkage(), CE->getVarName());
9241 else
9242 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9243 Flags, CE->getLinkage());
9244
9245 } else {
9246 llvm_unreachable("Unsupported entry kind.");
9247 }
9248 }
9249
9250 // Emit requires directive globals to a special entry so the runtime can
9251 // register them when the device image is loaded.
9252 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9253 // entries should be redesigned to better suit this use-case.
9257 /*Name=*/"",
9259 Config.getRequiresFlags(), "omp_offloading_entries");
9260}
9261
9263 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9264 unsigned FileID, unsigned Line, unsigned Count) {
9266 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9267 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9268 if (Count)
9269 OS << "_" << Count;
9270}
9271
9274 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9276 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9277 EntryInfo.Line, NewCount);
9278}
9279
9282 StringRef ParentName) {
9284 auto FileIDInfo = CallBack();
9285 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9286 report_fatal_error(("Unable to get unique ID for file, during "
9287 "getTargetEntryUniqueInfo, error message: " +
9288 EC.message())
9289 .c_str());
9290 }
9291
9292 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9293 std::get<1>(FileIDInfo));
9294}
9295
9297 unsigned Offset = 0;
9298 for (uint64_t Remain =
9299 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9301 !(Remain & 1); Remain = Remain >> 1)
9302 Offset++;
9303 return Offset;
9304}
9305
9308 // Shift left by getFlagMemberOffset() bits.
9309 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9310 << getFlagMemberOffset());
9311}
9312
9315 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9316 // If the entry is PTR_AND_OBJ but has not been marked with the special
9317 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9318 // marked as MEMBER_OF.
9319 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9321 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9324 return;
9325
9326 // Reset the placeholder value to prepare the flag for the assignment of the
9327 // proper MEMBER_OF value.
9328 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9329 Flags |= MemberOfFlag;
9330}
9331
9335 bool IsDeclaration, bool IsExternallyVisible,
9336 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9337 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9338 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9339 std::function<Constant *()> GlobalInitializer,
9340 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9341 // TODO: convert this to utilise the IRBuilder Config rather than
9342 // a passed down argument.
9343 if (OpenMPSIMD)
9344 return nullptr;
9345
9348 CaptureClause ==
9351 SmallString<64> PtrName;
9352 {
9353 raw_svector_ostream OS(PtrName);
9354 OS << MangledName;
9355 if (!IsExternallyVisible)
9356 OS << format("_%x", EntryInfo.FileID);
9357 OS << "_decl_tgt_ref_ptr";
9358 }
9359
9360 Value *Ptr = M.getNamedValue(PtrName);
9361
9362 if (!Ptr) {
9363 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9364 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9365
9366 auto *GV = cast<GlobalVariable>(Ptr);
9367 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9368
9369 if (!Config.isTargetDevice()) {
9370 if (GlobalInitializer)
9371 GV->setInitializer(GlobalInitializer());
9372 else
9373 GV->setInitializer(GlobalValue);
9374 }
9375
9377 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9378 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9379 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9380 }
9381
9382 return cast<Constant>(Ptr);
9383 }
9384
9385 return nullptr;
9386}
9387
9391 bool IsDeclaration, bool IsExternallyVisible,
9392 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9393 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9394 std::vector<Triple> TargetTriple,
9395 std::function<Constant *()> GlobalInitializer,
9396 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9397 Constant *Addr) {
9399 (TargetTriple.empty() && !Config.isTargetDevice()))
9400 return;
9401
9403 StringRef VarName;
9404 int64_t VarSize;
9406
9408 CaptureClause ==
9412 VarName = MangledName;
9413 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9414
9415 if (!IsDeclaration)
9416 VarSize = divideCeil(
9418 else
9419 VarSize = 0;
9420 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9421
9422 // This is a workaround carried over from Clang which prevents undesired
9423 // optimisation of internal variables.
9424 if (Config.isTargetDevice() &&
9425 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9426 // Do not create a "ref-variable" if the original is not also available
9427 // on the host.
9429 return;
9430
9431 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9432
9433 if (!M.getNamedValue(RefName)) {
9434 Constant *AddrRef =
9435 getOrCreateInternalVariable(Addr->getType(), RefName);
9436 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9437 GvAddrRef->setConstant(true);
9438 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9439 GvAddrRef->setInitializer(Addr);
9440 GeneratedRefs.push_back(GvAddrRef);
9441 }
9442 }
9443 } else {
9446 else
9448
9449 if (Config.isTargetDevice()) {
9450 VarName = (Addr) ? Addr->getName() : "";
9451 Addr = nullptr;
9452 } else {
9454 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9455 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9456 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9457 VarName = (Addr) ? Addr->getName() : "";
9458 }
9459 VarSize = M.getDataLayout().getPointerSize();
9461 }
9462
9464 Flags, Linkage);
9465}
9466
9467/// Loads all the offload entries information from the host IR
9468/// metadata.
9470 // If we are in target mode, load the metadata from the host IR. This code has
9471 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9472
9474 if (!MD)
9475 return;
9476
9477 for (MDNode *MN : MD->operands()) {
9478 auto &&GetMDInt = [MN](unsigned Idx) {
9479 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9480 return cast<ConstantInt>(V->getValue())->getZExtValue();
9481 };
9482
9483 auto &&GetMDString = [MN](unsigned Idx) {
9484 auto *V = cast<MDString>(MN->getOperand(Idx));
9485 return V->getString();
9486 };
9487
9488 switch (GetMDInt(0)) {
9489 default:
9490 llvm_unreachable("Unexpected metadata!");
9491 break;
9494 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9495 /*DeviceID=*/GetMDInt(1),
9496 /*FileID=*/GetMDInt(2),
9497 /*Line=*/GetMDInt(4),
9498 /*Count=*/GetMDInt(5));
9500 /*Order=*/GetMDInt(6));
9501 break;
9502 }
9506 /*MangledName=*/GetMDString(1),
9508 /*Flags=*/GetMDInt(2)),
9509 /*Order=*/GetMDInt(3));
9510 break;
9511 }
9512 }
9513}
9514
9516 if (HostFilePath.empty())
9517 return;
9518
9519 auto Buf = MemoryBuffer::getFile(HostFilePath);
9520 if (std::error_code Err = Buf.getError()) {
9521 report_fatal_error(("error opening host file from host file path inside of "
9522 "OpenMPIRBuilder: " +
9523 Err.message())
9524 .c_str());
9525 }
9526
9527 LLVMContext Ctx;
9529 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9530 if (std::error_code Err = M.getError()) {
9532 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9533 .c_str());
9534 }
9535
9536 loadOffloadInfoMetadata(*M.get());
9537}
9538
9539//===----------------------------------------------------------------------===//
9540// OffloadEntriesInfoManager
9541//===----------------------------------------------------------------------===//
9542
9544 return OffloadEntriesTargetRegion.empty() &&
9545 OffloadEntriesDeviceGlobalVar.empty();
9546}
9547
9548unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9549 const TargetRegionEntryInfo &EntryInfo) const {
9550 auto It = OffloadEntriesTargetRegionCount.find(
9551 getTargetRegionEntryCountKey(EntryInfo));
9552 if (It == OffloadEntriesTargetRegionCount.end())
9553 return 0;
9554 return It->second;
9555}
9556
9557void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9558 const TargetRegionEntryInfo &EntryInfo) {
9559 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9560 EntryInfo.Count + 1;
9561}
9562
9563/// Initialize target region entry.
9565 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9566 OffloadEntriesTargetRegion[EntryInfo] =
9567 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9568 OMPTargetRegionEntryTargetRegion);
9569 ++OffloadingEntriesNum;
9570}
9571
9575 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9576
9577 // Update the EntryInfo with the next available count for this location.
9578 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9579
9580 // If we are emitting code for a target, the entry is already initialized,
9581 // only has to be registered.
9582 if (OMPBuilder->Config.isTargetDevice()) {
9583 // This could happen if the device compilation is invoked standalone.
9584 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9585 return;
9586 }
9587 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9588 Entry.setAddress(Addr);
9589 Entry.setID(ID);
9590 Entry.setFlags(Flags);
9591 } else {
9593 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9594 return;
9595 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9596 "Target region entry already registered!");
9597 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9598 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9599 ++OffloadingEntriesNum;
9600 }
9601 incrementTargetRegionEntryInfoCount(EntryInfo);
9602}
9603
9605 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9606
9607 // Update the EntryInfo with the next available count for this location.
9608 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9609
9610 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9611 if (It == OffloadEntriesTargetRegion.end()) {
9612 return false;
9613 }
9614 // Fail if this entry is already registered.
9615 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9616 return false;
9617 return true;
9618}
9619
9621 const OffloadTargetRegionEntryInfoActTy &Action) {
9622 // Scan all target region entries and perform the provided action.
9623 for (const auto &It : OffloadEntriesTargetRegion) {
9624 Action(It.first, It.second);
9625 }
9626}
9627
9629 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9630 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9631 ++OffloadingEntriesNum;
9632}
9633
9635 StringRef VarName, Constant *Addr, int64_t VarSize,
9637 if (OMPBuilder->Config.isTargetDevice()) {
// Device compilation: only an entry that already exists is updated; nothing
// is created on this path.
9638 // This could happen if the device compilation is invoked standalone.
9639 if (!hasDeviceGlobalVarEntryInfo(VarName))
9640 return;
9641 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
// NOTE(review): the second hasDeviceGlobalVarEntryInfo(VarName) test looks
// redundant, since the early return above already guarantees the entry exists.
9642 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
// The entry already has an address: only fill in size and linkage if the size
// is still zero, and never overwrite the address.
9643 if (Entry.getVarSize() == 0) {
9644 Entry.setVarSize(VarSize);
9645 Entry.setLinkage(Linkage);
9646 }
9647 return;
9648 }
// No address recorded yet: store size, linkage and address.
9649 Entry.setVarSize(VarSize);
9650 Entry.setLinkage(Linkage);
9651 Entry.setAddress(Addr);
9652 } else {
// Non-device compilation: complete an existing entry, or create a new one.
9653 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9654 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9655 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9656 "Entry not initialized!");
// Size and linkage are only set while the recorded size is still zero.
9657 if (Entry.getVarSize() == 0) {
9658 Entry.setVarSize(VarSize);
9659 Entry.setLinkage(Linkage);
9660 }
9661 return;
9662 }
// NOTE(review): the `if` that pairs with the `else` below is not visible in
// this listing. The two branches differ only in the last argument
// (VarName.str() versus an empty string).
9664 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9665 Addr, VarSize, Flags, Linkage,
9666 VarName.str());
9667 else
9668 OffloadEntriesDeviceGlobalVar.try_emplace(
9669 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
// A new entry consumed the current order number; advance the counter.
9670 ++OffloadingEntriesNum;
9671 }
9672}
9673
9676 // Scan all device global variable entries and perform the provided action.
// Action is invoked once per entry with the entry's key and its value.
9677 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9678 Action(E.getKey(), E.getValue());
9679}
9680
9681//===----------------------------------------------------------------------===//
9682// CanonicalLoopInfo
9683//===----------------------------------------------------------------------===//
9684
// Appends the six control blocks of this loop to BBs, in the order:
// preheader, header, condition, latch, exit, after. Existing contents of BBs
// are kept.
9685void CanonicalLoopInfo::collectControlBlocks(
9687 // We only count those BBs as control block for which we do not need to
9688 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9689 // flow. For consistency, this also means we do not add the Body block, which
9690 // is just the entry to the body code.
// Reserve room for the six blocks appended below.
9691 BBs.reserve(BBs.size() + 6);
9692 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9693}
9694
9696 assert(isValid() && "Requires a valid canonical loop");
// The preheader is looked up from the CFG: it is the first predecessor of the
// header that is not the latch.
9697 for (BasicBlock *Pred : predecessors(Header)) {
9698 if (Pred != Latch)
9699 return Pred;
9700 }
// Reaching this point means the header has no predecessor besides the latch.
9701 llvm_unreachable("Missing preheader");
9702}
9703
9704void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9705 assert(isValid() && "Requires a valid canonical loop");
9706
9707 Instruction *CmpI = &getCond()->front();
9708 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9709 CmpI->setOperand(1, TripCount);
9710
9711#ifndef NDEBUG
9712 assertOK();
9713#endif
9714}
9715
9716void CanonicalLoopInfo::mapIndVar(
9717 llvm::function_ref<Value *(Instruction *)> Updater) {
9718 assert(isValid() && "Requires a valid canonical loop");
9719
9720 Instruction *OldIV = getIndVar();
9721
9722 // Record all uses excluding those introduced by the updater. Uses by the
9723 // CanonicalLoopInfo itself to keep track of the number of iterations are
9724 // excluded.
9725 SmallVector<Use *> ReplacableUses;
9726 for (Use &U : OldIV->uses()) {
9727 auto *User = dyn_cast<Instruction>(U.getUser());
9728 if (!User)
9729 continue;
9730 if (User->getParent() == getCond())
9731 continue;
9732 if (User->getParent() == getLatch())
9733 continue;
9734 ReplacableUses.push_back(&U);
9735 }
9736
9737 // Run the updater that may introduce new uses
9738 Value *NewIV = Updater(OldIV);
9739
9740 // Replace the old uses with the value returned by the updater.
9741 for (Use *U : ReplacableUses)
9742 U->set(NewIV);
9743
9744#ifndef NDEBUG
9745 assertOK();
9746#endif
9747}
9748
9750#ifndef NDEBUG
// Consistency self-check, active only when assertions are compiled in.
// Verifies the block skeleton (preheader -> header -> cond -> body/exit,
// latch -> header, exit -> after), the shape of the induction variable PHI and
// its increment, and the exit comparison against the trip count. Does nothing
// if the object does not currently describe a loop.
9751 // No constraints if this object currently does not describe a loop.
9752 if (!isValid())
9753 return;
9754
9755 BasicBlock *Preheader = getPreheader();
9756 BasicBlock *Body = getBody();
9757 BasicBlock *After = getAfter();
9758
9759 // Verify standard control-flow we use for OpenMP loops.
9760 assert(Preheader);
9761 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9762 "Preheader must terminate with unconditional branch");
9763 assert(Preheader->getSingleSuccessor() == Header &&
9764 "Preheader must jump to header");
9765
9766 assert(Header);
9767 assert(isa<BranchInst>(Header->getTerminator()) &&
9768 "Header must terminate with unconditional branch");
9769 assert(Header->getSingleSuccessor() == Cond &&
9770 "Header must jump to exiting block");
9771
9772 assert(Cond);
9773 assert(Cond->getSinglePredecessor() == Header &&
9774 "Exiting block only reachable from header");
9775
9776 assert(isa<BranchInst>(Cond->getTerminator()) &&
9777 "Exiting block must terminate with conditional branch");
9778 assert(size(successors(Cond)) == 2 &&
9779 "Exiting block must have two successors");
9780 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9781 "Exiting block's first successor jump to the body");
9782 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9783 "Exiting block's second successor must exit the loop");
9784
9785 assert(Body);
9786 assert(Body->getSinglePredecessor() == Cond &&
9787 "Body only reachable from exiting block");
9788 assert(!isa<PHINode>(Body->front()));
9789
9790 assert(Latch);
9791 assert(isa<BranchInst>(Latch->getTerminator()) &&
9792 "Latch must terminate with unconditional branch");
9793 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9794 // TODO: To support simple redirecting of the end of body code that reaches
9795 // the latch from multiple predecessors, introduce another auxiliary basic
// block like preheader and after.
9796 assert(Latch->getSinglePredecessor() != nullptr);
9797 assert(!isa<PHINode>(Latch->front()));
9798
9799 assert(Exit);
9800 assert(isa<BranchInst>(Exit->getTerminator()) &&
9801 "Exit block must terminate with unconditional branch");
9802 assert(Exit->getSingleSuccessor() == After &&
9803 "Exit block must jump to after block");
9804
9805 assert(After);
9806 assert(After->getSinglePredecessor() == Exit &&
9807 "After block only reachable from exit block");
9808 assert(After->empty() || !isa<PHINode>(After->front()));
9809
// The induction variable must be an integer PHI in the header that starts at
// zero (incoming from the preheader) and is fed by the latch.
9810 Instruction *IndVar = getIndVar();
9811 assert(IndVar && "Canonical induction variable not found?");
9812 assert(isa<IntegerType>(IndVar->getType()) &&
9813 "Induction variable must be an integer");
9814 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9815 "Induction variable must be a PHI in the loop header");
9816 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9817 assert(
9818 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9819 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9820
// The value coming from the latch must be `IndVar + 1`, computed in the latch.
9821 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9822 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9823 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9824 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9825 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9826 ->isOne());
9827
9828 Value *TripCount = getTripCount();
9829 assert(TripCount && "Loop trip count not found?");
9830 assert(IndVar->getType() == TripCount->getType() &&
9831 "Trip count and induction variable must have the same type");
9832
// The first instruction of the condition block must be `IndVar <u TripCount`.
// ICMP_ULT is the unsigned less-than predicate, so the message says unsigned.
9833 auto *CmpI = cast<CmpInst>(&Cond->front());
9834 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9835 "Exit condition must be an unsigned less-than comparison");
9836 assert(CmpI->getOperand(0) == IndVar &&
9837 "Exit condition must compare the induction variable");
9838 assert(CmpI->getOperand(1) == TripCount &&
9839 "Exit condition must compare with the trip count");
9840#endif
9841}
9842
// Drop the stored block pointers so this object no longer refers to any loop
// blocks. Presumably this is what makes isValid() report false afterwards --
// confirm against the class definition.
9844 Header = nullptr;
9845 Cond = nullptr;
9846 Latch = nullptr;
9847 Exit = nullptr;
9848}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:533
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:595
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:920
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:905
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1924
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1269
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1275
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2990
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Debug location.
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:809
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:853
arg_iterator arg_begin()
Definition: Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:356
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:754
size_t arg_size() const
Definition: Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:254
BasicBlock * getBlock() const
Definition: IRBuilder.h:269
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:267
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:270
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1397
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1064
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2289
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1864
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1902
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1796
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2554
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2297
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1830
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2066
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1286
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2201
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2547
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1254
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1048
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1995
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:578
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2060
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2150
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1359
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1897
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2213
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1401
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1363
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:518
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1889
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1744
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:274
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2398
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1167
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1367
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1144
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1813
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1439
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2048
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1498
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1114
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1936
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1982
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1826
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1350
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2145
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1410
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2580
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2444
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1877
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2034
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1520
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1138
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2305
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:478
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2227
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:286
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2575
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:561
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1849
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1479
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1542
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2383
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1427
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:655
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2081
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2160
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1384
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:951
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1551
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1543
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1731
iterator_range< op_iterator > operands()
Definition: Metadata.h:1827
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'to' declare target.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr)
Generator for #omp task
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:970
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1028
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1038
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the user if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has a 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of the maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61