//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before which something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
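
// Editor's note: a minimal sketch (not part of the original source) of the
// interleaving hazard isConflictIP guards against. Both builders store the
// same "insert before" instruction, so their emissions end up interleaved:
//
//   IRBuilder<> B1(Ctx), B2(Ctx);
//   B1.SetInsertPoint(BB, BB->begin());
//   B2.SetInsertPoint(BB, BB->begin()); // conflicts with B1's insert point
//   B1.CreateAlloca(B1.getInt32Ty());   // lands before the shared anchor
//   B2.CreateAlloca(B2.getInt32Ty());   // also before the anchor, i.e.
//   B1.CreateAlloca(B1.getInt32Ty());   // between B1's two allocas
//
// OpenMPIRBuilder entry points assert !isConflictIP(...) to reject such
// ambiguous insert-point pairs up front.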

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use based on schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is used by default in the OpenMP runtime
      // library, so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Unsupported schedule type");
  return Result;
}
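
// Editor's note: a worked example (not in the original source) of how the
// three helpers above compose. For `#pragma omp for schedule(dynamic, 4)`
// with no monotonicity modifiers and no ordered clause:
//
//   OMPScheduleType Ty = computeOpenMPScheduleType(
//       OMP_SCHEDULE_Dynamic, /*HasChunks=*/true, /*HasSimdModifier=*/false,
//       /*HasMonotonicModifier=*/false, /*HasNonmonotonicModifier=*/false,
//       /*HasOrderedClause=*/false);
//
// The base type is BaseDynamicChunked, the ordering step adds
// ModifierUnordered (yielding UnorderedDynamicChunked), and the OpenMP 5.1
// default rule adds ModifierNonmonotonic because the schedule is neither
// static nor ordered.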

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
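
// Editor's note: a minimal usage sketch (not in the original source) of the
// block-splitting helpers above. Given a builder positioned mid-block,
// splitBB moves everything from the insert point onward into a fresh block
// and, with CreateBranch set, keeps the old block falling through to it:
//
//   IRBuilder<> Builder(SomeInst); // hypothetical position inside a block
//   BasicBlock *Cont =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".cont");
//   // `Cont` now holds the instructions that followed the insert point, and
//   // the builder inserts before the new unconditional branch in the old
//   // block, so subsequently emitted code runs before the split-off tail.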

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
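
// Editor's note: a brief usage sketch (not in the original source). A
// frontend typically builds one config and hands it to the OpenMPIRBuilder
// before emitting any construct, so later codegen can query the `requires`
// clauses and device mode:
//
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/true,
//                                /*HasRequiresDynamicAllocators=*/false);
//   OMPBuilder.setConfig(Config); // OMPBuilder: an OpenMPIRBuilder instance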

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
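
// Editor's note: an illustrative sketch (not in the original source) of the
// !callback annotation created above for __kmpc_fork_call. In textual IR it
// looks roughly like:
//
//   declare !callback !1 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !1 = !{!2}
//   !2 = !{i64 2, i64 -1, i64 -1, i1 true}
//
// i.e. the callback callee is operand 2 (the microtask), its first two
// parameters have no known corresponding call arguments (-1), and the
// variadic arguments are forwarded to it. Interprocedural analyses use this
// encoding to follow values into the outlined function.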

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target or risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so). This notably has to occur
  // after the OutlinedInfo candidates have been extracted, so the end product
  // will not be implicitly adversely affected by any raises unless
  // intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExpr's with further effort. However, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads; otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
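
// Editor's note: an illustrative sketch (not in the original source) of what
// getOrCreateIdent emits in textual IR for a source-location string @0 of
// size N (the flag value 2 assumes OMP_IDENT_FLAG_KMPC == 0x02):
//
//   %struct.ident_t = type { i32, i32, i32, i32, ptr }
//   @1 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 N, ptr @0 }, align 8
//
// The second field carries the flags and the fourth the string size;
// identical initializers are reused via the module scan above.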

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
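
// Editor's note: for illustration (not in the original source), the string
// built above follows libomp's ";file;function;line;column;;" convention, so
// a call like
//
//   uint32_t Size;
//   Constant *Str = OMPBuilder.getOrCreateSrcLocStr(
//       "foo", "test.c", /*Line=*/3, /*Column=*/5, Size);
//
// produces the constant string ";test.c;foo;3;5;;" (Size == 17), matching the
// unknown-location default ";unknown;unknown;0;0;;" used below.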

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
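
// Editor's note: an illustrative sketch (not in the original source; global
// names are hypothetical) of the IR createBarrier emits for an explicit
// barrier in a non-cancellable region:
//
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call void @__kmpc_barrier(ptr @loc.barrier, i32 %tid)
//
// where @loc and @loc.barrier are ident_t globals differing only in the
// OMP_IDENT_FLAG_BARRIER_EXPL flag. In a cancellable parallel region the call
// becomes `call i32 @__kmpc_cancel_barrier(...)` and the returned flag feeds
// emitCancelationCheckImpl below.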

InsertPointTy OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                                            Value *IfCondition,
                                            omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
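
// Editor's note: an illustrative sketch (not in the original source; value
// names are hypothetical) of the device-side call shape produced above, in
// textual IR:
//
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 1, i32 -1,
//                                 i32 -1, ptr @outlined..omp_par, ptr null,
//                                 ptr %args, i64 %num_args)
//
// i.e. the if-condition folds to 1 when absent, -1 signals "no num_threads /
// proc_bind request", the outlined body is passed as the task function with
// no wrapper, and the captured variables are packed into the pointer array
// allocated above.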

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
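
// Editor's note: an illustrative sketch (not in the original source; names
// are hypothetical) of the host-side replacement performed above. A direct
// call to the extracted function such as
//
//   call void @foo..omp_par(ptr %tid.addr, ptr %zero.addr, ptr %a)
//
// becomes
//
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr %ident, i32 1, ptr @foo..omp_par, ptr %a)
//
// with the runtime supplying the two leading tid arguments to the microtask,
// which is why the fake tid/zero addresses modeled earlier can be deleted.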

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1554
1555 FunctionCallee TIDRTLFn =
1556 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1557
1558 auto PrivHelper = [&](Value &V) {
1559 if (&V == TIDAddr || &V == ZeroAddr) {
1560 OI.ExcludeArgsFromAggregate.push_back(&V);
1561 return;
1562 }
1563
1565 for (Use &U : V.uses())
1566 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1567 if (ParallelRegionBlockSet.count(UserI->getParent()))
1568 Uses.insert(&U);
1569
1570 // __kmpc_fork_call expects extra arguments as pointers. If the input
1571 // already has a pointer type, everything is fine. Otherwise, store the
1572 // value onto stack and load it back inside the to-be-outlined region. This
1573 // will ensure only the pointer will be passed to the function.
1574 // FIXME: if there are more than 15 trailing arguments, they must be
1575 // additionally packed in a struct.
1576 Value *Inner = &V;
1577 if (!V.getType()->isPointerTy()) {
1579 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1580
1581 Builder.restoreIP(OuterAllocaIP);
1582 Value *Ptr =
1583 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1584
1585 // Store to stack at end of the block that currently branches to the entry
1586 // block of the to-be-outlined region.
1587 Builder.SetInsertPoint(InsertBB,
1588 InsertBB->getTerminator()->getIterator());
1589 Builder.CreateStore(&V, Ptr);
1590
1591 // Load back next to allocations in the to-be-outlined region.
1592 Builder.restoreIP(InnerAllocaIP);
1593 Inner = Builder.CreateLoad(V.getType(), Ptr);
1594 }
1595
1596 Value *ReplacementValue = nullptr;
1597 CallInst *CI = dyn_cast<CallInst>(&V);
1598 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1599 ReplacementValue = PrivTID;
1600 } else {
1601 Builder.restoreIP(
1602 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1603 InnerAllocaIP = {
1604 InnerAllocaIP.getBlock(),
1605 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1606
1607 assert(ReplacementValue &&
1608 "Expected copy/create callback to set replacement value!");
1609 if (ReplacementValue == &V)
1610 return;
1611 }
1612
1613 for (Use *UPtr : Uses)
1614 UPtr->set(ReplacementValue);
1615 };
1616
1617 // Reset the inner alloca insertion as it will be used for loading the values
1618 // wrapped into pointers before passing them into the to-be-outlined region.
1619 // Configure it to insert immediately after the fake use of the zero address
1620 // so that the reloaded values are available in the generated body and the
1621 // OpenMP-related values (thread ID and zero address pointers) remain leading
1622 // in the argument list.
1623 InnerAllocaIP = IRBuilder<>::InsertPoint(
1624 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1625
1626 // Reset the outer alloca insertion point to the entry of the relevant block
1627 // in case it was invalidated.
1628 OuterAllocaIP = IRBuilder<>::InsertPoint(
1629 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1630
1631 for (Value *Input : Inputs) {
1632 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1633 PrivHelper(*Input);
1634 }
1635 LLVM_DEBUG({
1636 for (Value *Output : Outputs)
1637 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1638 });
1639 assert(Outputs.empty() &&
1640 "OpenMP outlining should not produce live-out values!");
1641
1642 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1643 LLVM_DEBUG({
1644 for (auto *BB : Blocks)
1645 dbgs() << " PBR: " << BB->getName() << "\n";
1646 });
1647
1648 // Adjust the finalization stack, verify the adjustment, and call the
1649 // finalize function one last time to finalize values between the pre-fini
1650 // block and the exit block if we left the parallel region "the normal way".
1651 auto FiniInfo = FinalizationStack.pop_back_val();
1652 (void)FiniInfo;
1653 assert(FiniInfo.DK == OMPD_parallel &&
1654 "Unexpected finalization stack state!");
1655
1656 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1657
1658 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1659 FiniCB(PreFiniIP);
1660
1661 // Register the outlined info.
1662 addOutlineInfo(std::move(OI));
1663
1664 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1665 UI->eraseFromParent();
1666
1667 return AfterIP;
1668}
1669
1670 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1671 // Build call void __kmpc_flush(ident_t *loc)
1672 uint32_t SrcLocStrSize;
1673 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1674 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1675
1676 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1677}
1678
1679 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1680 if (!updateToLocation(Loc))
1681 return;
1682 emitFlush(Loc);
1683}
1684
1685 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1686 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1687 // global_tid);
1688 uint32_t SrcLocStrSize;
1689 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1690 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1691 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1692
1693 // Ignore return result until untied tasks are supported.
1694 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1695 Args);
1696}
1697
1698 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1699 if (!updateToLocation(Loc))
1700 return;
1701 emitTaskwaitImpl(Loc);
1702}
1703
1704 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1705 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1706 uint32_t SrcLocStrSize;
1707 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1708 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1709 Constant *I32Null = ConstantInt::getNullValue(Int32);
1710 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1711
1712 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1713 Args);
1714}
1715
1716 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1717 if (!updateToLocation(Loc))
1718 return;
1719 emitTaskyieldImpl(Loc);
1720}
1721
1722// Processes the dependencies in Dependencies and does the following
1723// - Allocates space on the stack of an array of DependInfo objects
1724// - Populates each DependInfo object with relevant information of
1725// the corresponding dependence.
1726// - All code is inserted in the entry block of the current function.
1727 static Value *emitTaskDependencies(
1728 OpenMPIRBuilder &OMPBuilder,
1729 SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1730 // Early return if we have no dependencies to process
1731 if (Dependencies.empty())
1732 return nullptr;
1733
1734 // Given a vector of DependData objects, in this function we create an
1735 // array on the stack that holds kmp_depend_info objects corresponding
1736 // to each dependency. This is then passed to the OpenMP runtime.
1737 // For example, if there are 'n' dependencies then the following pseudo
1738 // code is generated. Assume the first dependence is on a variable 'a'
1739 //
1740 // \code{c}
1741 // DepArray = alloc(n x sizeof(kmp_depend_info));
1742 // idx = 0;
1743 // DepArray[idx].base_addr = ptrtoint(&a);
1744 // DepArray[idx].len = 8;
1745 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1746 // ++idx;
1747 // DepArray[idx].base_addr = ...;
1748 // \endcode
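// The fields written below mirror the runtime's kmp_depend_info layout.
// The following is a sketch for orientation only; the authoritative
// definition lives in the OpenMP runtime sources (kmp.h):
//
// \code{c}
// struct kmp_depend_info {
//   intptr_t base_addr; /* RTLDependInfoFields::BaseAddr */
//   size_t len;         /* RTLDependInfoFields::Len */
//   uint8_t flags;      /* RTLDependInfoFields::Flags, the dependence kind */
// };
// \endcode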
1749
1750 IRBuilderBase &Builder = OMPBuilder.Builder;
1751 Type *DependInfo = OMPBuilder.DependInfo;
1752 Module &M = OMPBuilder.M;
1753
1754 Value *DepArray = nullptr;
1755 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1756 Builder.SetInsertPoint(
1757 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1758
1759 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1760 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1761
1762 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1763 Value *Base =
1764 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1765 // Store the pointer to the variable
1766 Value *Addr = Builder.CreateStructGEP(
1767 DependInfo, Base,
1768 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1769 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1770 Builder.CreateStore(DepValPtr, Addr);
1771 // Store the size of the variable
1772 Value *Size = Builder.CreateStructGEP(
1773 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1774 Builder.CreateStore(
1775 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1776 Size);
1777 // Store the dependency kind
1778 Value *Flags = Builder.CreateStructGEP(
1779 DependInfo, Base,
1780 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1781 Builder.CreateStore(
1782 ConstantInt::get(Builder.getInt8Ty(),
1783 static_cast<unsigned int>(Dep.DepKind)),
1784 Flags);
1785 }
1786 Builder.restoreIP(OldIP);
1787 return DepArray;
1788}
1789
1790 OpenMPIRBuilder::InsertPointTy
1791 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1792 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1793 bool Tied, Value *Final, Value *IfCondition,
1794 SmallVector<DependData> Dependencies) {
1795
1796 if (!updateToLocation(Loc))
1797 return InsertPointTy();
1798
1799 uint32_t SrcLocStrSize;
1800 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1801 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1802 // The current basic block is split into four basic blocks. After outlining,
1803 // they will be mapped as follows:
1804 // ```
1805 // def current_fn() {
1806 // current_basic_block:
1807 // br label %task.exit
1808 // task.exit:
1809 // ; instructions after task
1810 // }
1811 // def outlined_fn() {
1812 // task.alloca:
1813 // br label %task.body
1814 // task.body:
1815 // ret void
1816 // }
1817 // ```
1818 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1819 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1820 BasicBlock *TaskAllocaBB =
1821 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1822
1823 InsertPointTy TaskAllocaIP =
1824 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1825 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1826 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1827
1828 OutlineInfo OI;
1829 OI.EntryBB = TaskAllocaBB;
1830 OI.OuterAllocaBB = AllocaIP.getBlock();
1831 OI.ExitBB = TaskExitBB;
1832
1833 // Add the thread ID argument.
1834 SmallVector<Instruction *, 4> ToBeDeleted;
1835 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1836 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1837
1838 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1839 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1840 // Replace the Stale CI by appropriate RTL function call.
1841 assert(OutlinedFn.getNumUses() == 1 &&
1842 "there must be a single user for the outlined function");
1843 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1844
1845 // HasShareds is true if any variables are captured in the outlined region,
1846 // false otherwise.
1847 bool HasShareds = StaleCI->arg_size() > 1;
1848 Builder.SetInsertPoint(StaleCI);
1849
1850 // Gather the arguments for emitting the runtime call for
1851 // @__kmpc_omp_task_alloc
1852 Function *TaskAllocFn =
1853 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1854
1855 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1856 // call.
1857 Value *ThreadID = getOrCreateThreadID(Ident);
1858
1859 // Argument - `flags`
1860 // Task is tied iff (Flags & 1) == 1.
1861 // Task is untied iff (Flags & 1) == 0.
1862 // Task is final iff (Flags & 2) == 2.
1863 // Task is not final iff (Flags & 2) == 0.
1864 // TODO: Handle the other flags.
1865 Value *Flags = Builder.getInt32(Tied);
1866 if (Final) {
1867 Value *FinalFlag =
1868 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1869 Flags = Builder.CreateOr(FinalFlag, Flags);
1870 }
1871
1872 // Argument - `sizeof_kmp_task_t` (TaskSize)
1873 // Tasksize refers to the size in bytes of kmp_task_t data structure
1874 // including private vars accessed in task.
1875 // TODO: add kmp_task_t_with_privates (privates)
1876 Value *TaskSize = Builder.getInt64(
1877 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1878
1879 // Argument - `sizeof_shareds` (SharedsSize)
1880 // SharedsSize refers to the shareds array size in the kmp_task_t data
1881 // structure.
1882 Value *SharedsSize = Builder.getInt64(0);
1883 if (HasShareds) {
1884 AllocaInst *ArgStructAlloca =
1885 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1886 assert(ArgStructAlloca &&
1887 "Unable to find the alloca instruction corresponding to arguments "
1888 "for extracted function");
1889 StructType *ArgStructType =
1890 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1891 assert(ArgStructType && "Unable to find struct type corresponding to "
1892 "arguments for extracted function");
1893 SharedsSize =
1894 M.getDataLayout().getTypeStoreSize(ArgStructType);
1895 }
1896 // Emit the @__kmpc_omp_task_alloc runtime call
1897 // The runtime call returns a pointer to an area where the task captured
1898 // variables must be copied before the task is run (TaskData)
1899 CallInst *TaskData = Builder.CreateCall(
1900 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1901 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1902 /*task_func=*/&OutlinedFn});
1903
1904 // Copy the arguments for outlined function
1905 if (HasShareds) {
1906 Value *Shareds = StaleCI->getArgOperand(1);
1907 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1908 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1909 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1910 SharedsSize);
1911 }
1912
1913 Value *DepArray = nullptr;
1914 if (Dependencies.size()) {
1915 InsertPointTy OldIP = Builder.saveIP();
1916 Builder.SetInsertPoint(
1917 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1918
1919 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1920 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1921
1922 unsigned P = 0;
1923 for (const DependData &Dep : Dependencies) {
1924 Value *Base =
1925 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1926 // Store the pointer to the variable
1927 Value *Addr = Builder.CreateStructGEP(
1928 DependInfo, Base,
1929 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1930 Value *DepValPtr =
1931 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1932 Builder.CreateStore(DepValPtr, Addr);
1933 // Store the size of the variable
1934 Value *Size = Builder.CreateStructGEP(
1935 DependInfo, Base,
1936 static_cast<unsigned int>(RTLDependInfoFields::Len));
1937 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1938 Dep.DepValueType)),
1939 Size);
1940 // Store the dependency kind
1941 Value *Flags = Builder.CreateStructGEP(
1942 DependInfo, Base,
1943 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1944 Builder.CreateStore(
1945 ConstantInt::get(Builder.getInt8Ty(),
1946 static_cast<unsigned int>(Dep.DepKind)),
1947 Flags);
1948 ++P;
1949 }
1950
1951 Builder.restoreIP(OldIP);
1952 }
1953
1954 // In the presence of the `if` clause, the following IR is generated:
1955 // ...
1956 // %data = call @__kmpc_omp_task_alloc(...)
1957 // br i1 %if_condition, label %then, label %else
1958 // then:
1959 // call @__kmpc_omp_task(...)
1960 // br label %exit
1961 // else:
1962 // ;; Wait for resolution of dependencies, if any, before
1963 // ;; beginning the task
1964 // call @__kmpc_omp_wait_deps(...)
1965 // call @__kmpc_omp_task_begin_if0(...)
1966 // call @outlined_fn(...)
1967 // call @__kmpc_omp_task_complete_if0(...)
1968 // br label %exit
1969 // exit:
1970 // ...
1971 if (IfCondition) {
1972 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1973 // terminator.
1974 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1975 Instruction *IfTerminator =
1976 Builder.GetInsertPoint()->getParent()->getTerminator();
1977 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1978 Builder.SetInsertPoint(IfTerminator);
1979 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1980 &ElseTI);
1981 Builder.SetInsertPoint(ElseTI);
1982
1983 if (Dependencies.size()) {
1984 Function *TaskWaitFn =
1985 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1986 Builder.CreateCall(
1987 TaskWaitFn,
1988 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1989 ConstantInt::get(Builder.getInt32Ty(), 0),
1990 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1991 }
1992 Function *TaskBeginFn =
1993 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1994 Function *TaskCompleteFn =
1995 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1996 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1997 CallInst *CI = nullptr;
1998 if (HasShareds)
1999 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2000 else
2001 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2002 CI->setDebugLoc(StaleCI->getDebugLoc());
2003 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2004 Builder.SetInsertPoint(ThenTI);
2005 }
2006
2007 if (Dependencies.size()) {
2008 Function *TaskFn =
2009 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2010 Builder.CreateCall(
2011 TaskFn,
2012 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2013 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2014 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2015
2016 } else {
2017 // Emit the @__kmpc_omp_task runtime call to spawn the task
2018 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2019 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2020 }
2021
2022 StaleCI->eraseFromParent();
2023
2024 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2025 if (HasShareds) {
2026 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2027 OutlinedFn.getArg(1)->replaceUsesWithIf(
2028 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2029 }
2030
2031 for (Instruction *I : llvm::reverse(ToBeDeleted))
2032 I->eraseFromParent();
2033 };
2034
2035 addOutlineInfo(std::move(OI));
2036 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2037
2038 return Builder.saveIP();
2039}
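// A minimal usage sketch for createTask. The names OMPB, Loc, and AllocaIP
// below are illustrative placeholders, not part of the API:
//
// \code{c}
// auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) {
//   // Emit the task body at CodeGenIP; put allocas at AllocaIP.
// };
// Builder.restoreIP(OMPB.createTask(Loc, AllocaIP, BodyGenCB, /*Tied=*/true,
//                                   /*Final=*/nullptr, /*IfCondition=*/nullptr,
//                                   /*Dependencies=*/{}));
// \endcode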
2040
2041 OpenMPIRBuilder::InsertPointTy
2042 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2043 InsertPointTy AllocaIP,
2044 BodyGenCallbackTy BodyGenCB) {
2045 if (!updateToLocation(Loc))
2046 return InsertPointTy();
2047
2048 uint32_t SrcLocStrSize;
2049 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2050 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2051 Value *ThreadID = getOrCreateThreadID(Ident);
2052
2053 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2054 Function *TaskgroupFn =
2055 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2056 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2057
2058 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2059 BodyGenCB(AllocaIP, Builder.saveIP());
2060
2061 Builder.SetInsertPoint(TaskgroupExitBB);
2062 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2063 Function *EndTaskgroupFn =
2064 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2065 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2066
2067 return Builder.saveIP();
2068}
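// A taskgroup waits on completion of its child tasks and their descendant
// tasks. A minimal sketch, with OMPB, Loc, and AllocaIP as illustrative
// placeholders:
//
// \code{c}
// OMPB.createTaskgroup(Loc, AllocaIP,
//     [&](OpenMPIRBuilder::InsertPointTy, OpenMPIRBuilder::InsertPointTy) {
//       // Spawn tasks here; __kmpc_end_taskgroup waits for all of them.
//     });
// \endcode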
2069
2070 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
2071 const LocationDescription &Loc, InsertPointTy AllocaIP,
2072 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2073 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2074 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2075
2076 if (!updateToLocation(Loc))
2077 return Loc.IP;
2078
2079 auto FiniCBWrapper = [&](InsertPointTy IP) {
2080 if (IP.getBlock()->end() != IP.getPoint())
2081 return FiniCB(IP);
2082 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2083 // will fail because that function requires the finalization basic block to
2084 // have a terminator, which was already removed by EmitOMPRegionBody.
2085 // IP is currently at the cancellation block.
2086 // We need to backtrack to the condition block to fetch
2087 // the exit block and create a branch from the cancellation
2088 // block to the exit block.
2090 Builder.restoreIP(IP);
2091 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2092 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2093 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2094 Instruction *I = Builder.CreateBr(ExitBB);
2095 IP = InsertPointTy(I->getParent(), I->getIterator());
2096 return FiniCB(IP);
2097 };
2098
2099 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2100
2101 // Each section is emitted as a switch case
2102 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2103 // -> OMP.createSection() which generates the IR for each section
2104 // Iterate through all sections and emit a switch construct:
2105 // switch (IV) {
2106 // case 0:
2107 // <SectionStmt[0]>;
2108 // break;
2109 // ...
2110 // case <NumSection> - 1:
2111 // <SectionStmt[<NumSection> - 1]>;
2112 // break;
2113 // }
2114 // ...
2115 // section_loop.after:
2116 // <FiniCB>;
2117 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2118 Builder.restoreIP(CodeGenIP);
2119 BasicBlock *Continue =
2120 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2121 Function *CurFn = Continue->getParent();
2122 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2123
2124 unsigned CaseNumber = 0;
2125 for (auto SectionCB : SectionCBs) {
2126 BasicBlock *CaseBB = BasicBlock::Create(
2127 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2128 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2129 Builder.SetInsertPoint(CaseBB);
2130 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2131 SectionCB(InsertPointTy(),
2132 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2133 CaseNumber++;
2134 }
2135 // remove the existing terminator from body BB since there can be no
2136 // terminators after switch/case
2137 };
2138 // Loop body ends here
2139 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2140 Type *I32Ty = Type::getInt32Ty(M.getContext());
2141 Value *LB = ConstantInt::get(I32Ty, 0);
2142 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2143 Value *ST = ConstantInt::get(I32Ty, 1);
2144 CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2145 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2146 InsertPointTy AfterIP =
2147 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2148
2149 // Apply the finalization callback in LoopAfterBB
2150 auto FiniInfo = FinalizationStack.pop_back_val();
2151 assert(FiniInfo.DK == OMPD_sections &&
2152 "Unexpected finalization stack state!");
2153 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2154 Builder.restoreIP(AfterIP);
2155 BasicBlock *FiniBB =
2156 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2157 CB(Builder.saveIP());
2158 AfterIP = {FiniBB, FiniBB->begin()};
2159 }
2160
2161 return AfterIP;
2162}
2163
2164 OpenMPIRBuilder::InsertPointTy
2165 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2166 BodyGenCallbackTy BodyGenCB,
2167 FinalizeCallbackTy FiniCB) {
2168 if (!updateToLocation(Loc))
2169 return Loc.IP;
2170
2171 auto FiniCBWrapper = [&](InsertPointTy IP) {
2172 if (IP.getBlock()->end() != IP.getPoint())
2173 return FiniCB(IP);
2174 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2175 // will fail because that function requires the finalization basic block to
2176 // have a terminator, which was already removed by EmitOMPRegionBody.
2177 // IP is currently at the cancellation block.
2178 // We need to backtrack to the condition block to fetch
2179 // the exit block and create a branch from the cancellation
2180 // block to the exit block.
2182 Builder.restoreIP(IP);
2183 auto *CaseBB = Loc.IP.getBlock();
2184 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2185 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2186 Instruction *I = Builder.CreateBr(ExitBB);
2187 IP = InsertPointTy(I->getParent(), I->getIterator());
2188 return FiniCB(IP);
2189 };
2190
2191 Directive OMPD = Directive::OMPD_sections;
2192 // Since we are using Finalization Callback here, HasFinalize
2193 // and IsCancellable have to be true
2194 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2195 /*Conditional*/ false, /*hasFinalize*/ true,
2196 /*IsCancellable*/ true);
2197}
2198
2199 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2200 BasicBlock::iterator IT(I);
2201 IT++;
2202 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2203}
2204
2205void OpenMPIRBuilder::emitUsed(StringRef Name,
2206 std::vector<WeakTrackingVH> &List) {
2207 if (List.empty())
2208 return;
2209
2210 // Convert List to what ConstantArray needs.
2211 SmallVector<Constant *, 8> UsedArray;
2212 UsedArray.resize(List.size());
2213 for (unsigned I = 0, E = List.size(); I != E; ++I)
2214 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2215 cast<Constant>(&*List[I]), Builder.getPtrTy());
2216
2217 if (UsedArray.empty())
2218 return;
2219 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2220
2221 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2222 ConstantArray::get(ATy, UsedArray), Name);
2223
2224 GV->setSection("llvm.metadata");
2225}
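// For example, emitUsed("llvm.compiler.used", List) with two entries yields
// IR along these lines (a sketch, not verbatim output):
//
// \code{c}
// @llvm.compiler.used = appending global [2 x ptr] [ptr @a, ptr @b],
//                       section "llvm.metadata"
// \endcode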
2226
2227Value *OpenMPIRBuilder::getGPUThreadID() {
2228 return Builder.CreateCall(
2229 getOrCreateRuntimeFunction(M,
2230 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2231 {});
2232}
2233
2234Value *OpenMPIRBuilder::getGPUWarpSize() {
2235 return Builder.CreateCall(
2236 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2237}
2238
2239Value *OpenMPIRBuilder::getNVPTXWarpID() {
2240 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2241 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2242}
2243
2244Value *OpenMPIRBuilder::getNVPTXLaneID() {
2245 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2246 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2247 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2248 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2249 "nvptx_lane_id");
2250}
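// With the common warp size of 32, LaneIDBits is 5 and LaneIDMask is 0x1f;
// e.g. GPU thread 70 maps to warp_id = 70 >> 5 = 2 and
// lane_id = 70 & 0x1f = 6.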
2251
2252Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2253 Type *ToType) {
2254 Type *FromType = From->getType();
2255 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2256 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2257 assert(FromSize > 0 && "From size must be greater than zero");
2258 assert(ToSize > 0 && "To size must be greater than zero");
2259 if (FromType == ToType)
2260 return From;
2261 if (FromSize == ToSize)
2262 return Builder.CreateBitCast(From, ToType);
2263 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2264 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2265 InsertPointTy SaveIP = Builder.saveIP();
2266 Builder.restoreIP(AllocaIP);
2267 Value *CastItem = Builder.CreateAlloca(ToType);
2268 Builder.restoreIP(SaveIP);
2269
2270 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2271 CastItem, FromType->getPointerTo());
2272 Builder.CreateStore(From, ValCastItem);
2273 return Builder.CreateLoad(ToType, CastItem);
2274}
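// For example, casting a 4-byte float to an 8-byte i64 takes the fall-back
// path above and emits, roughly:
//
// \code{c}
// %cast.item = alloca i64            ; at AllocaIP
// store float %from, ptr %cast.item  ; type-punning store
// %res = load i64, ptr %cast.item
// \endcode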
2275
2276Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2277 Value *Element,
2278 Type *ElementType,
2279 Value *Offset) {
2280 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2281 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2282
2283 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2284 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2285 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2286 Value *WarpSize =
2287 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2288 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2289 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2290 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2291 Value *WarpSizeCast =
2292 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2293 Value *ShuffleCall =
2294 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2295 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2296}
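// For an 8-byte element this selects __kmpc_shuffle_int64 and produces,
// approximately:
//
// \code{c}
// %ws = call i32 @__kmpc_get_warp_size()
// %ws16 = trunc i32 %ws to i16
// %r = call i64 @__kmpc_shuffle_int64(i64 %elem, i16 %offset, i16 %ws16)
// \endcode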
2297
2298void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2299 Value *DstAddr, Type *ElemType,
2300 Value *Offset, Type *ReductionArrayTy) {
2301 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2302 // Create the loop over the big sized data.
2303 // ptr = (void*)Elem;
2304 // ptrEnd = (void*) Elem + 1;
2305 // Step = 8;
2306 // while (ptr + Step < ptrEnd)
2307 // shuffle((int64_t)*ptr);
2308 // Step = 4;
2309 // while (ptr + Step < ptrEnd)
2310 // shuffle((int32_t)*ptr);
2311 // ...
2312 Type *IndexTy = Builder.getIndexTy(
2313 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2314 Value *ElemPtr = DstAddr;
2315 Value *Ptr = SrcAddr;
2316 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2317 if (Size < IntSize)
2318 continue;
2319 Type *IntType = Builder.getIntNTy(IntSize * 8);
2320 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2321 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2322 Value *SrcAddrGEP =
2323 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2324 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2325 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2326
2327 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2328 if ((Size / IntSize) > 1) {
2329 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2330 SrcAddrGEP, Builder.getPtrTy());
2331 BasicBlock *PreCondBB =
2332 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2333 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2334 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2335 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2336 emitBlock(PreCondBB, CurFunc);
2337 PHINode *PhiSrc =
2338 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2339 PhiSrc->addIncoming(Ptr, CurrentBB);
2340 PHINode *PhiDest =
2341 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2342 PhiDest->addIncoming(ElemPtr, CurrentBB);
2343 Ptr = PhiSrc;
2344 ElemPtr = PhiDest;
2345 Value *PtrDiff = Builder.CreatePtrDiff(
2346 Builder.getInt8Ty(), PtrEnd,
2347 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2348 Builder.CreateCondBr(
2349 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2350 ExitBB);
2351 emitBlock(ThenBB, CurFunc);
2352 Value *Res = createRuntimeShuffleFunction(
2353 AllocaIP,
2354 Builder.CreateAlignedLoad(
2355 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2356 IntType, Offset);
2357 Builder.CreateAlignedStore(Res, ElemPtr,
2358 M.getDataLayout().getPrefTypeAlign(ElemType));
2359 Value *LocalPtr =
2360 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2361 Value *LocalElemPtr =
2362 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2363 PhiSrc->addIncoming(LocalPtr, ThenBB);
2364 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2365 emitBranch(PreCondBB);
2366 emitBlock(ExitBB, CurFunc);
2367 } else {
2368 Value *Res = createRuntimeShuffleFunction(
2369 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2370 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2371 Res->getType()->getScalarSizeInBits())
2372 Res = Builder.CreateTrunc(Res, ElemType);
2373 Builder.CreateStore(Res, ElemPtr);
2374 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2375 ElemPtr =
2376 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2377 }
2378 Size = Size % IntSize;
2379 }
2380}
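// As an example of the decomposition above: a 12-byte element is moved with a
// single i64 shuffle (12 / 8 == 1, so no loop is emitted) followed by a
// single i32 shuffle for the remaining 12 % 8 == 4 bytes.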
2381
2382void OpenMPIRBuilder::emitReductionListCopy(
2383 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2384 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2385 CopyOptionsTy CopyOptions) {
2386 Type *IndexTy = Builder.getIndexTy(
2387 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2388 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2389
2390 // Iterates, element-by-element, through the source Reduce list and
2391 // makes a copy.
2392 for (auto En : enumerate(ReductionInfos)) {
2393 const ReductionInfo &RI = En.value();
2394 Value *SrcElementAddr = nullptr;
2395 Value *DestElementAddr = nullptr;
2396 Value *DestElementPtrAddr = nullptr;
2397 // Should we shuffle in an element from a remote lane?
2398 bool ShuffleInElement = false;
2399 // Set to true to update the pointer in the dest Reduce list to a
2400 // newly created element.
2401 bool UpdateDestListPtr = false;
2402
2403 // Step 1.1: Get the address for the src element in the Reduce list.
2404 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2405 ReductionArrayTy, SrcBase,
2406 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2407 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2408
2409 // Step 1.2: Create a temporary to store the element in the destination
2410 // Reduce list.
2411 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2412 ReductionArrayTy, DestBase,
2413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2414 switch (Action) {
2415 case CopyAction::RemoteLaneToThread: {
2416 InsertPointTy CurIP = Builder.saveIP();
2417 Builder.restoreIP(AllocaIP);
2418 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2419 ".omp.reduction.element");
2420 DestAlloca->setAlignment(
2421 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2422 DestElementAddr = DestAlloca;
2423 DestElementAddr =
2424 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2425 DestElementAddr->getName() + ".ascast");
2426 Builder.restoreIP(CurIP);
2427 ShuffleInElement = true;
2428 UpdateDestListPtr = true;
2429 break;
2430 }
2431 case CopyAction::ThreadCopy: {
2432 DestElementAddr =
2433 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2434 break;
2435 }
2436 }
2437
2438 // Now that all active lanes have read the element in the
2439 // Reduce list, shuffle over the value from the remote lane.
2440 if (ShuffleInElement) {
2441 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2442 RemoteLaneOffset, ReductionArrayTy);
2443 } else {
2444 switch (RI.EvaluationKind) {
2445 case EvalKind::Scalar: {
2446 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2447 // Store the source element value to the dest element address.
2448 Builder.CreateStore(Elem, DestElementAddr);
2449 break;
2450 }
2451 case EvalKind::Complex: {
2452 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2453 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2454 Value *SrcReal = Builder.CreateLoad(
2455 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2456 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2457 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2458 Value *SrcImg = Builder.CreateLoad(
2459 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2460
2461 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2462 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2463 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2464 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2465 Builder.CreateStore(SrcReal, DestRealPtr);
2466 Builder.CreateStore(SrcImg, DestImgPtr);
2467 break;
2468 }
2469 case EvalKind::Aggregate: {
2470 Value *SizeVal = Builder.getInt64(
2471 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2472 Builder.CreateMemCpy(
2473 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2474 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2475 SizeVal, false);
2476 break;
2477 }
2478 };
2479 }
2480
2481 // Step 3.1: Modify reference in dest Reduce list as needed.
2482 // Modifying the reference in Reduce list to point to the newly
2483 // created element. The element is live in the current function
2484 // scope and that of functions it invokes (i.e., reduce_function).
2485 // RemoteReduceData[i] = (void*)&RemoteElem
2486 if (UpdateDestListPtr) {
2487 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2488 DestElementAddr, Builder.getPtrTy(),
2489 DestElementAddr->getName() + ".ascast");
2490 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2491 }
2492 }
2493}
2494
2495Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2496 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2497 AttributeList FuncAttrs) {
2498 InsertPointTy SavedIP = Builder.saveIP();
2499 LLVMContext &Ctx = M.getContext();
2500 FunctionType *FuncTy = FunctionType::get(
2501 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2502 /* IsVarArg */ false);
2503 Function *WcFunc =
2505 "_omp_reduction_inter_warp_copy_func", &M);
2506 WcFunc->setAttributes(FuncAttrs);
2507 WcFunc->addParamAttr(0, Attribute::NoUndef);
2508 WcFunc->addParamAttr(1, Attribute::NoUndef);
2509 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2510 Builder.SetInsertPoint(EntryBB);
2511
2512 // ReduceList: thread local Reduce list.
2513 // At the stage of the computation when this function is called, partially
2514 // aggregated values reside in the first lane of every active warp.
2515 Argument *ReduceListArg = WcFunc->getArg(0);
2516 // NumWarps: number of warps active in the parallel region. This could
2517 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2518 Argument *NumWarpsArg = WcFunc->getArg(1);
2519
2520 // This array is used as a medium to transfer, one reduce element at a time,
2521 // the data from the first lane of every warp to lanes in the first warp
2522 // in order to perform the final step of a reduction in a parallel region
2523 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2524 // for reduced latency, as well as to have a distinct copy for concurrently
2525 // executing target regions. The array is declared with weak linkage so
2526 // as to be shared across compilation units.
2527 StringRef TransferMediumName =
2528 "__openmp_nvptx_data_transfer_temporary_storage";
2529 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2530 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2531 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2532 if (!TransferMedium) {
2533 TransferMedium = new GlobalVariable(
2534 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2535 UndefValue::get(ArrayTy), TransferMediumName,
2536 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2537 /*AddressSpace=*/3);
2538 }
2539
2540 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2541 Value *GPUThreadID = getGPUThreadID();
2542 // nvptx_lane_id = nvptx_id % warpsize
2543 Value *LaneID = getNVPTXLaneID();
2544 // nvptx_warp_id = nvptx_id / warpsize
2545 Value *WarpID = getNVPTXWarpID();
2546
2547 InsertPointTy AllocaIP =
2548 InsertPointTy(Builder.GetInsertBlock(),
2549 Builder.GetInsertBlock()->getFirstInsertionPt());
2550 Type *Arg0Type = ReduceListArg->getType();
2551 Type *Arg1Type = NumWarpsArg->getType();
2552 Builder.restoreIP(AllocaIP);
2553 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2554 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2555 AllocaInst *NumWarpsAlloca =
2556 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2557 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2558 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2559 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2560 NumWarpsAlloca, Arg1Type->getPointerTo(),
2561 NumWarpsAlloca->getName() + ".ascast");
2562 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2563 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2564 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2565 InsertPointTy CodeGenIP =
2566 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2567 Builder.restoreIP(CodeGenIP);
2568
2569 Value *ReduceList =
2570 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2571
2572 for (auto En : enumerate(ReductionInfos)) {
2573 //
2574 // Warp master copies reduce element to transfer medium in __shared__
2575 // memory.
2576 //
2577 const ReductionInfo &RI = En.value();
2578 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2579 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2580 Type *CType = Builder.getIntNTy(TySize * 8);
2581
2582 unsigned NumIters = RealTySize / TySize;
2583 if (NumIters == 0)
2584 continue;
2585 Value *Cnt = nullptr;
2586 Value *CntAddr = nullptr;
2587 BasicBlock *PrecondBB = nullptr;
2588 BasicBlock *ExitBB = nullptr;
2589 if (NumIters > 1) {
2590 CodeGenIP = Builder.saveIP();
2591 Builder.restoreIP(AllocaIP);
2592 CntAddr =
2593 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2594
2595 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2596 CntAddr->getName() + ".ascast");
2597 Builder.restoreIP(CodeGenIP);
2598 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2599 CntAddr,
2600 /*Volatile=*/false);
2601 PrecondBB = BasicBlock::Create(Ctx, "precond");
2602 ExitBB = BasicBlock::Create(Ctx, "exit");
2603 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2604 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2605 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2606 /*Volatile=*/false);
2607 Value *Cmp = Builder.CreateICmpULT(
2608 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2609 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2610 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2611 }
2612
2613 // kmpc_barrier.
2614 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2615 omp::Directive::OMPD_unknown,
2616 /* ForceSimpleCall */ false,
2617 /* CheckCancelFlag */ true);
2618 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2619 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2620 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2621
2622 // if (lane_id == 0)
2623 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2624 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2625 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2626
2627 // Reduce element = LocalReduceList[i]
2628 auto *RedListArrayTy =
2629 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2630 Type *IndexTy = Builder.getIndexTy(
2631 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2632 Value *ElemPtrPtr =
2633 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2634 {ConstantInt::get(IndexTy, 0),
2635 ConstantInt::get(IndexTy, En.index())});
2636 // elemptr = ((CopyType*)(elemptrptr)) + I
2637 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2638 if (NumIters > 1)
2639 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2640
2641 // Get pointer to location in transfer medium.
2642 // MediumPtr = &medium[warp_id]
2643 Value *MediumPtr = Builder.CreateInBoundsGEP(
2644 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2645 // elem = *elemptr
2646 //*MediumPtr = elem
2647 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2648 // Store the source element value to the dest element address.
2649 Builder.CreateStore(Elem, MediumPtr,
2650 /*IsVolatile*/ true);
2651 Builder.CreateBr(MergeBB);
2652
2653 // else
2654 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2655 Builder.CreateBr(MergeBB);
2656
2657 // endif
2658 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2659 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2660 omp::Directive::OMPD_unknown,
2661 /* ForceSimpleCall */ false,
2662 /* CheckCancelFlag */ true);
2663
2664 // Warp 0 copies reduce element from transfer medium
2665 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2666 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2667 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2668
2669 Value *NumWarpsVal =
2670 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2671 // Up to 32 threads in warp 0 are active.
2672 Value *IsActiveThread =
2673 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2674 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2675
2676 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2677
2678 // SecMediumPtr = &medium[tid]
2679 // SrcMediumVal = *SrcMediumPtr
2680 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2681 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2682 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2683 Value *TargetElemPtrPtr =
2684 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2685 {ConstantInt::get(IndexTy, 0),
2686 ConstantInt::get(IndexTy, En.index())});
2687 Value *TargetElemPtrVal =
2688 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2689 Value *TargetElemPtr = TargetElemPtrVal;
2690 if (NumIters > 1)
2691 TargetElemPtr =
2692 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2693
2694 // *TargetElemPtr = SrcMediumVal;
2695 Value *SrcMediumValue =
2696 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2697 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2698 Builder.CreateBr(W0MergeBB);
2699
2700 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2701 Builder.CreateBr(W0MergeBB);
2702
2703 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2704
2705 if (NumIters > 1) {
2706 Cnt = Builder.CreateNSWAdd(
2707 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2708 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2709
2710 auto *CurFn = Builder.GetInsertBlock()->getParent();
2711 emitBranch(PrecondBB);
2712 emitBlock(ExitBB, CurFn);
2713 }
2714 RealTySize %= TySize;
2715 }
2716 }
2717
2718 Builder.CreateRetVoid();
2719 Builder.restoreIP(SavedIP);
2720
2721 return WcFunc;
2722}
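// Per reduce element, the generated function behaves approximately like the
// following pseudo code:
//
// \code{c}
// barrier();
// if (lane_id == 0)                  /* warp master */
//   medium[warp_id] = elem;          /* volatile store to __shared__ */
// barrier();
// if (thread_id < num_warps)         /* lanes of warp 0 */
//   local_elem = medium[thread_id];  /* volatile load, then store back */
// \endcode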
2723
2724Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2725 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2726 AttributeList FuncAttrs) {
2727 LLVMContext &Ctx = M.getContext();
2728 FunctionType *FuncTy =
2729 FunctionType::get(Builder.getVoidTy(),
2730 {Builder.getPtrTy(), Builder.getInt16Ty(),
2731 Builder.getInt16Ty(), Builder.getInt16Ty()},
2732 /* IsVarArg */ false);
2733 Function *SarFunc =
2735 "_omp_reduction_shuffle_and_reduce_func", &M);
2736 SarFunc->setAttributes(FuncAttrs);
2737 SarFunc->addParamAttr(0, Attribute::NoUndef);
2738 SarFunc->addParamAttr(1, Attribute::NoUndef);
2739 SarFunc->addParamAttr(2, Attribute::NoUndef);
2740 SarFunc->addParamAttr(3, Attribute::NoUndef);
2741 SarFunc->addParamAttr(1, Attribute::SExt);
2742 SarFunc->addParamAttr(2, Attribute::SExt);
2743 SarFunc->addParamAttr(3, Attribute::SExt);
2744 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2745 Builder.SetInsertPoint(EntryBB);
2746
2747 // Thread local Reduce list used to host the values of data to be reduced.
2748 Argument *ReduceListArg = SarFunc->getArg(0);
2749 // Current lane id; could be logical.
2750 Argument *LaneIDArg = SarFunc->getArg(1);
2751 // Offset of the remote source lane relative to the current lane.
2752 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2753 // Algorithm version. This is expected to be known at compile time.
2754 Argument *AlgoVerArg = SarFunc->getArg(3);
2755
2756 Type *ReduceListArgType = ReduceListArg->getType();
2757 Type *LaneIDArgType = LaneIDArg->getType();
2758 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2759 Value *ReduceListAlloca = Builder.CreateAlloca(
2760 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2761 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2762 LaneIDArg->getName() + ".addr");
2763 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2764 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2765 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2766 AlgoVerArg->getName() + ".addr");
2767 ArrayType *RedListArrayTy =
2768 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2769
2770 // Create a local thread-private variable to host the Reduce list
2771 // from a remote lane.
2772 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2773 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2774
2775 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2776 ReduceListAlloca, ReduceListArgType,
2777 ReduceListAlloca->getName() + ".ascast");
2778 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2779 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2780 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2781 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2782 RemoteLaneOffsetAlloca->getName() + ".ascast");
2783 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2784 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2785 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2786 RemoteReductionListAlloca, Builder.getPtrTy(),
2787 RemoteReductionListAlloca->getName() + ".ascast");
2788
2789 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2790 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2791 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2792 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2793
2794 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2795 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2796 Value *RemoteLaneOffset =
2797 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2798 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2799
2800 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2801
2802 // This loop iterates through the list of reduce elements and copies,
2803 // element by element, from a remote lane in the warp to RemoteReduceList,
2804 // hosted on the thread's stack.
2805 emitReductionListCopy(
2806 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2807 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2808
2809 // The actions to be performed on the Remote Reduce list are dependent
2810 // on the algorithm version.
2811 //
2812 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2813 // LaneId % 2 == 0 && Offset > 0):
2814 // do the reduction value aggregation
2815 //
2816 // The thread local variable Reduce list is mutated in place to host the
2817 // reduced data, which is the aggregated value produced from local and
2818 // remote lanes.
2819 //
2820 // Note that AlgoVer is expected to be a constant integer known at compile
2821 // time.
2822 // When AlgoVer==0, the first conjunction evaluates to true, making
2823 // the entire predicate true during compile time.
2824 // When AlgoVer==1, the second conjunction has only the second part to be
2825 // evaluated during runtime. The other conjunctions evaluate to false
2826 // during compile time.
2827 // When AlgoVer==2, the third conjunction has only the second part to be
2828 // evaluated during runtime. The other conjunctions evaluate to false
2829 // during compile time.
2830 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2831 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2832 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2833 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2834 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2835 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2836 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2837 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2838 Value *RemoteOffsetComp =
2839 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2840 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2841 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2842 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2843
2844 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2845 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2846 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2847
2848 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2849 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2850 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2851 ReduceList, Builder.getPtrTy());
2852 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2853 RemoteListAddrCast, Builder.getPtrTy());
2854 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2855 ->addFnAttr(Attribute::NoUnwind);
2856 Builder.CreateBr(MergeBB);
2857
2858 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2859 Builder.CreateBr(MergeBB);
2860
2861 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2862
2863 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2864 // Reduce list.
2865 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2866 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2867 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2868
2869 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2870 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2871 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2872 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2873
2874 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2875 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2876 ReductionInfos, RemoteListAddrCast, ReduceList);
2877 Builder.CreateBr(CpyMergeBB);
2878
2879 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2880 Builder.CreateBr(CpyMergeBB);
2881
2882 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2883
2884 Builder.CreateRetVoid();
2885
2886 return SarFunc;
2887}
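// The runtime typically applies the generated function with halving offsets
// to reduce a full warp (AlgoVer == 0), leaving the combined value in lane 0.
// A sketch for a 32-lane warp:
//
// \code{c}
// for (short offset = 16; offset > 0; offset /= 2)
//   shuffle_and_reduce(reduce_list, lane_id, offset, /*AlgoVer=*/0);
// \endcode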
2888
2889Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2890 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2891 AttributeList FuncAttrs) {
2892 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2893 LLVMContext &Ctx = M.getContext();
2894 FunctionType *FuncTy = FunctionType::get(
2895 Builder.getVoidTy(),
2896 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2897 /* IsVarArg */ false);
2898 Function *LtGCFunc =
2900 "_omp_reduction_list_to_global_copy_func", &M);
2901 LtGCFunc->setAttributes(FuncAttrs);
2902 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2903 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2904 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2905
2906 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2907 Builder.SetInsertPoint(EntryBlock);
2908
2909 // Buffer: global reduction buffer.
2910 Argument *BufferArg = LtGCFunc->getArg(0);
2911 // Idx: index of the buffer.
2912 Argument *IdxArg = LtGCFunc->getArg(1);
2913 // ReduceList: thread local Reduce list.
2914 Argument *ReduceListArg = LtGCFunc->getArg(2);
2915
2916 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2917 BufferArg->getName() + ".addr");
2918 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2919 IdxArg->getName() + ".addr");
2920 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2921 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2922 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2923 BufferArgAlloca, Builder.getPtrTy(),
2924 BufferArgAlloca->getName() + ".ascast");
2925 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2926 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2927 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2928 ReduceListArgAlloca, Builder.getPtrTy(),
2929 ReduceListArgAlloca->getName() + ".ascast");
2930
2931 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2932 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2933 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2934
2935 Value *LocalReduceList =
2936 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2937 Value *BufferArgVal =
2938 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2939 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2940 Type *IndexTy = Builder.getIndexTy(
2941 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2942 for (auto En : enumerate(ReductionInfos)) {
2943 const ReductionInfo &RI = En.value();
2944 auto *RedListArrayTy =
2945 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2946 // Reduce element = LocalReduceList[i]
2947 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2948 RedListArrayTy, LocalReduceList,
2949 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2950 // elemptr = ((CopyType*)(elemptrptr)) + I
2951 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2952
2953 // Global = Buffer.VD[Idx];
2954 Value *BufferVD =
2955 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2956 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
2957 ReductionsBufferTy, BufferVD, 0, En.index());
2958
2959 switch (RI.EvaluationKind) {
2960 case EvalKind::Scalar: {
2961 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2962 Builder.CreateStore(TargetElement, GlobVal);
2963 break;
2964 }
2965 case EvalKind::Complex: {
2966 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2967 RI.ElementType, ElemPtr, 0, 0, ".realp");
2968 Value *SrcReal = Builder.CreateLoad(
2969 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2970 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2971 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2972 Value *SrcImg = Builder.CreateLoad(
2973 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2974
2975 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2976 RI.ElementType, GlobVal, 0, 0, ".realp");
2977 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2978 RI.ElementType, GlobVal, 0, 1, ".imagp");
2979 Builder.CreateStore(SrcReal, DestRealPtr);
2980 Builder.CreateStore(SrcImg, DestImgPtr);
2981 break;
2982 }
2983 case EvalKind::Aggregate: {
2984 Value *SizeVal =
2985 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2986 Builder.CreateMemCpy(
2987 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2988 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2989 break;
2990 }
2991 }
2992 }
2993
2994 Builder.CreateRetVoid();
2995 Builder.restoreIP(OldIP);
2996 return LtGCFunc;
2997}
2998
2999Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3000 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3001 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3002 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3003 LLVMContext &Ctx = M.getContext();
3004 FunctionType *FuncTy = FunctionType::get(
3005 Builder.getVoidTy(),
3006 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3007 /* IsVarArg */ false);
3008 Function *LtGRFunc =
3010 "_omp_reduction_list_to_global_reduce_func", &M);
3011 LtGRFunc->setAttributes(FuncAttrs);
3012 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3013 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3014 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3015
3016 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3017 Builder.SetInsertPoint(EntryBlock);
3018
3019 // Buffer: global reduction buffer.
3020 Argument *BufferArg = LtGRFunc->getArg(0);
3021 // Idx: index of the buffer.
3022 Argument *IdxArg = LtGRFunc->getArg(1);
3023 // ReduceList: thread local Reduce list.
3024 Argument *ReduceListArg = LtGRFunc->getArg(2);
3025
3026 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3027 BufferArg->getName() + ".addr");
3028 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3029 IdxArg->getName() + ".addr");
3030 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3031 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3032 auto *RedListArrayTy =
3033 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3034
3035 // 1. Build a list of reduction variables.
3036 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3037 Value *LocalReduceList =
3038 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3039
3040 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3041 BufferArgAlloca, Builder.getPtrTy(),
3042 BufferArgAlloca->getName() + ".ascast");
3043 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3044 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3045 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3046 ReduceListArgAlloca, Builder.getPtrTy(),
3047 ReduceListArgAlloca->getName() + ".ascast");
3048 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3049 LocalReduceList, Builder.getPtrTy(),
3050 LocalReduceList->getName() + ".ascast");
3051
3052 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3053 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3054 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3055
3056 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3057 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3058 Type *IndexTy = Builder.getIndexTy(
3059 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3060 for (auto En : enumerate(ReductionInfos)) {
3061 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3062 RedListArrayTy, LocalReduceListAddrCast,
3063 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3064 Value *BufferVD =
3065 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3066 // Global = Buffer.VD[Idx];
3067 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3068 ReductionsBufferTy, BufferVD, 0, En.index());
3069 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3070 }
3071
3072 // Call reduce_function(GlobalReduceList, ReduceList)
3073 Value *ReduceList =
3074 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3075 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3076 ->addFnAttr(Attribute::NoUnwind);
3077 Builder.CreateRetVoid();
3078 Builder.restoreIP(OldIP);
3079 return LtGRFunc;
3080}
3081
3082Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3083 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3084 AttributeList FuncAttrs) {
3085 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3086 LLVMContext &Ctx = M.getContext();
3087 FunctionType *FuncTy = FunctionType::get(
3088 Builder.getVoidTy(),
3089 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3090 /* IsVarArg */ false);
3091 Function *LtGCFunc =
3093 "_omp_reduction_global_to_list_copy_func", &M);
3094 LtGCFunc->setAttributes(FuncAttrs);
3095 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3096 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3097 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3098
3099 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3100 Builder.SetInsertPoint(EntryBlock);
3101
3102 // Buffer: global reduction buffer.
3103 Argument *BufferArg = LtGCFunc->getArg(0);
3104 // Idx: index of the buffer.
3105 Argument *IdxArg = LtGCFunc->getArg(1);
3106 // ReduceList: thread local Reduce list.
3107 Argument *ReduceListArg = LtGCFunc->getArg(2);
3108
3109 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3110 BufferArg->getName() + ".addr");
3111 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3112 IdxArg->getName() + ".addr");
3113 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3114 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3115 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3116 BufferArgAlloca, Builder.getPtrTy(),
3117 BufferArgAlloca->getName() + ".ascast");
3118 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3119 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3120 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3121 ReduceListArgAlloca, Builder.getPtrTy(),
3122 ReduceListArgAlloca->getName() + ".ascast");
3123 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3124 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3125 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3126
3127 Value *LocalReduceList =
3128 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3129 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3130 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3131 Type *IndexTy = Builder.getIndexTy(
3132 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3133 for (auto En : enumerate(ReductionInfos)) {
3134 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3135 auto *RedListArrayTy =
3136 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3137 // Reduce element = LocalReduceList[i]
3138 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3139 RedListArrayTy, LocalReduceList,
3140 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3141 // elemptr = ((CopyType*)(elemptrptr)) + I
3142 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3143 // Global = Buffer.VD[Idx];
3144 Value *BufferVD =
3145 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3146 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3147 ReductionsBufferTy, BufferVD, 0, En.index());
3148
3149 switch (RI.EvaluationKind) {
3150 case EvalKind::Scalar: {
3151 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3152 Builder.CreateStore(TargetElement, ElemPtr);
3153 break;
3154 }
3155 case EvalKind::Complex: {
3156 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3157 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3158 Value *SrcReal = Builder.CreateLoad(
3159 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3160 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3161 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3162 Value *SrcImg = Builder.CreateLoad(
3163 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3164
3165 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3166 RI.ElementType, ElemPtr, 0, 0, ".realp");
3167 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3168 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3169 Builder.CreateStore(SrcReal, DestRealPtr);
3170 Builder.CreateStore(SrcImg, DestImgPtr);
3171 break;
3172 }
3173 case EvalKind::Aggregate: {
3174 Value *SizeVal =
3175 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3176 Builder.CreateMemCpy(
3177 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3178 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3179 SizeVal, false);
3180 break;
3181 }
3182 }
3183 }
3184
3185 Builder.CreateRetVoid();
3186 Builder.restoreIP(OldIP);
3187 return LtGCFunc;
3188}
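// [Editor's note] The copy helper above goes in the opposite direction;
// roughly, in illustrative pseudo-C:
//
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for (i in 0 .. <n>-1)                 // per EvaluationKind: a scalar
//       *reduce_list[i] = buffer[idx].elem_i;  // store, a real/imag pair, or
//   }                                          // a memcpy for aggregates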
3189
3190Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3191 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3192 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3193 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3194 LLVMContext &Ctx = M.getContext();
3195 auto *FuncTy = FunctionType::get(
3196 Builder.getVoidTy(),
3197 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3198 /* IsVarArg */ false);
3199 Function *LtGRFunc =
3200 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3201 "_omp_reduction_global_to_list_reduce_func", &M);
3202 LtGRFunc->setAttributes(FuncAttrs);
3203 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3204 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3205 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3206
3207 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3208 Builder.SetInsertPoint(EntryBlock);
3209
3210 // Buffer: global reduction buffer.
3211 Argument *BufferArg = LtGRFunc->getArg(0);
3212 // Idx: index of the buffer.
3213 Argument *IdxArg = LtGRFunc->getArg(1);
3214 // ReduceList: thread local Reduce list.
3215 Argument *ReduceListArg = LtGRFunc->getArg(2);
3216
3217 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3218 BufferArg->getName() + ".addr");
3219 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3220 IdxArg->getName() + ".addr");
3221 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3222 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3223 ArrayType *RedListArrayTy =
3224 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3225
3226 // 1. Build a list of reduction variables.
3227 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3228 Value *LocalReduceList =
3229 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3230
3231 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3232 BufferArgAlloca, Builder.getPtrTy(),
3233 BufferArgAlloca->getName() + ".ascast");
3234 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3235 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3236 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3237 ReduceListArgAlloca, Builder.getPtrTy(),
3238 ReduceListArgAlloca->getName() + ".ascast");
3239 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3240 LocalReduceList, Builder.getPtrTy(),
3241 LocalReduceList->getName() + ".ascast");
3242
3243 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3244 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3245 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3246
3247 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3248 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3249 Type *IndexTy = Builder.getIndexTy(
3250 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3251 for (auto En : enumerate(ReductionInfos)) {
3252 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3253 RedListArrayTy, ReductionList,
3254 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3255 // Global = Buffer.VD[Idx];
3256 Value *BufferVD =
3257 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3258 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3259 ReductionsBufferTy, BufferVD, 0, En.index());
3260 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3261 }
3262
3263 // Call reduce_function(ReduceList, GlobalReduceList)
3264 Value *ReduceList =
3265 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3266 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3267 ->addFnAttr(Attribute::NoUnwind);
3268 Builder.CreateRetVoid();
3269 Builder.restoreIP(OldIP);
3270 return LtGRFunc;
3271}
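// [Editor's note] This helper mirrors the list-to-global reduce function, but
// the operand order of the reduce_function call is swapped, making the
// thread-local list the accumulator. A sketch, with illustrative names:
//
//   void _omp_reduction_global_to_list_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *remote[<n>];                     // remote[i] = &buffer[idx].elem_i
//     reduce_function(reduce_list, remote);  // note the swapped operands
//   }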
3272
3273std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3274 std::string Suffix =
3275 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3276 return (Name + Suffix).str();
3277}
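// [Editor's note] For a reducer named "foo" this yields a name of the form
// "foo<sep>omp<sep>reduction<sep>reduction_func"; the exact separators come
// from createPlatformSpecificName and are target/config dependent.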
3278
3279Function *OpenMPIRBuilder::createReductionFunction(
3280 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3281 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3282 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3283 {Builder.getPtrTy(), Builder.getPtrTy()},
3284 /* IsVarArg */ false);
3285 std::string Name = getReductionFuncName(ReducerName);
3286 Function *ReductionFunc =
3287 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3288 ReductionFunc->setAttributes(FuncAttrs);
3289 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3290 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3291 BasicBlock *EntryBB =
3292 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3293 Builder.SetInsertPoint(EntryBB);
3294
3295 // We need to alloca memory here and deal with the pointers before getting
3296 // the LHS/RHS pointers out
3297 Value *LHSArrayPtr = nullptr;
3298 Value *RHSArrayPtr = nullptr;
3299 Argument *Arg0 = ReductionFunc->getArg(0);
3300 Argument *Arg1 = ReductionFunc->getArg(1);
3301 Type *Arg0Type = Arg0->getType();
3302 Type *Arg1Type = Arg1->getType();
3303
3304 Value *LHSAlloca =
3305 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3306 Value *RHSAlloca =
3307 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3308 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3309 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3310 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3311 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3312 Builder.CreateStore(Arg0, LHSAddrCast);
3313 Builder.CreateStore(Arg1, RHSAddrCast);
3314 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3315 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3316
3317 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3318 Type *IndexTy = Builder.getIndexTy(
3319 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3320 SmallVector<Value *> LHSPtrs, RHSPtrs;
3321 for (auto En : enumerate(ReductionInfos)) {
3322 const ReductionInfo &RI = En.value();
3323 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3324 RedArrayTy, RHSArrayPtr,
3325 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3326 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3327 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3328 RHSI8Ptr, RI.PrivateVariable->getType(),
3329 RHSI8Ptr->getName() + ".ascast");
3330
3331 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3332 RedArrayTy, LHSArrayPtr,
3333 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3334 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3335 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3336 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3337
3338 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3339 LHSPtrs.emplace_back(LHSPtr);
3340 RHSPtrs.emplace_back(RHSPtr);
3341 } else {
3342 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3343 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3344 Value *Reduced;
3345 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3346 if (!Builder.GetInsertBlock())
3347 return ReductionFunc;
3348 Builder.CreateStore(Reduced, LHSPtr);
3349 }
3350 }
3351
3352 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3353 for (auto En : enumerate(ReductionInfos)) {
3354 unsigned Index = En.index();
3355 const ReductionInfo &RI = En.value();
3356 Value *LHSFixupPtr, *RHSFixupPtr;
3357 Builder.restoreIP(RI.ReductionGenClang(
3358 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3359
3360 // Fix the callback code generated to use the correct Values for the LHS
3361 // and RHS
3362 LHSFixupPtr->replaceUsesWithIf(
3363 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3364 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3365 ReductionFunc;
3366 });
3367 RHSFixupPtr->replaceUsesWithIf(
3368 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3369 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3370 ReductionFunc;
3371 });
3372 }
3373
3374 Builder.CreateRetVoid();
3375 return ReductionFunc;
3376}
3377
3378static void
3379 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3380 bool IsGPU) {
3381 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3382 (void)RI;
3383 assert(RI.Variable && "expected non-null variable");
3384 assert(RI.PrivateVariable && "expected non-null private variable");
3385 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3386 "expected non-null reduction generator callback");
3387 if (!IsGPU) {
3388 assert(
3389 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3390 "expected variables and their private equivalents to have the same "
3391 "type");
3392 }
3393 assert(RI.Variable->getType()->isPointerTy() &&
3394 "expected variables to be pointers");
3395 }
3396}
3397
3398 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
3399 const LocationDescription &Loc, InsertPointTy AllocaIP,
3400 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3401 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3402 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3403 unsigned ReductionBufNum, Value *SrcLocInfo) {
3404 if (!updateToLocation(Loc))
3405 return InsertPointTy();
3406 Builder.restoreIP(CodeGenIP);
3407 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3408 LLVMContext &Ctx = M.getContext();
3409
3410 // Source location for the ident struct
3411 if (!SrcLocInfo) {
3412 uint32_t SrcLocStrSize;
3413 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3414 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3415 }
3416
3417 if (ReductionInfos.size() == 0)
3418 return Builder.saveIP();
3419
3420 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3421 AttributeList FuncAttrs;
3422 AttrBuilder AttrBldr(Ctx);
3423 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3424 AttrBldr.addAttribute(Attr);
3425 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3426 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3427
3428 Function *ReductionFunc = nullptr;
3429 CodeGenIP = Builder.saveIP();
3430 ReductionFunc =
3431 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3432 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3433 Builder.restoreIP(CodeGenIP);
3434
3435 // Set the grid value in the config needed for lowering later on
3436 if (GridValue.has_value())
3437 Config.setGridValue(GridValue.value());
3438 else
3439 Config.setGridValue(getGridValue(T, ReductionFunc));
3440
3441 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3442 // RedList, shuffle_reduce_func, interwarp_copy_func);
3443 // or
3444 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3445 Value *Res;
3446
3447 // 1. Build a list of reduction variables.
3448 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3449 auto Size = ReductionInfos.size();
3450 Type *PtrTy = PointerType::getUnqual(Ctx);
3451 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3452 CodeGenIP = Builder.saveIP();
3453 Builder.restoreIP(AllocaIP);
3454 Value *ReductionListAlloca =
3455 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3456 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3457 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3458 Builder.restoreIP(CodeGenIP);
3459 Type *IndexTy = Builder.getIndexTy(
3460 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3461 for (auto En : enumerate(ReductionInfos)) {
3462 const ReductionInfo &RI = En.value();
3463 Value *ElemPtr = Builder.CreateInBoundsGEP(
3464 RedArrayTy, ReductionList,
3465 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3466 Value *CastElem =
3467 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3468 Builder.CreateStore(CastElem, ElemPtr);
3469 }
3470 CodeGenIP = Builder.saveIP();
3471 Function *SarFunc =
3472 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3473 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3474 Builder.restoreIP(CodeGenIP);
3475
3476 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3477
3478 unsigned MaxDataSize = 0;
3479 SmallVector<Type *> ReductionTypeArgs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3482 if (Size > MaxDataSize)
3483 MaxDataSize = Size;
3484 ReductionTypeArgs.emplace_back(En.value().ElementType);
3485 }
3486 Value *ReductionDataSize =
3487 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3488 if (!IsTeamsReduction) {
3489 Value *SarFuncCast =
3490 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3491 Value *WcFuncCast =
3492 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3493 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3494 WcFuncCast};
3495 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3496 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3497 Res = Builder.CreateCall(Pv2Ptr, Args);
3498 } else {
3499 CodeGenIP = Builder.saveIP();
3500 StructType *ReductionsBufferTy = StructType::create(
3501 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3502 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3503 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3504 Function *LtGCFunc = emitListToGlobalCopyFunction(
3505 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3506 Function *LtGRFunc = emitListToGlobalReduceFunction(
3507 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3508 Function *GtLCFunc = emitGlobalToListCopyFunction(
3509 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3510 Function *GtLRFunc = emitGlobalToListReduceFunction(
3511 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3512 Builder.restoreIP(CodeGenIP);
3513
3514 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3515 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3516
3517 Value *Args3[] = {SrcLocInfo,
3518 KernelTeamsReductionPtr,
3519 Builder.getInt32(ReductionBufNum),
3520 ReductionDataSize,
3521 RL,
3522 SarFunc,
3523 WcFunc,
3524 LtGCFunc,
3525 LtGRFunc,
3526 GtLCFunc,
3527 GtLRFunc};
3528
3529 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3530 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3531 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3532 }
3533
3534 // 5. Build if (res == 1)
3535 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3536 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3537 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3538 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3539
3540 // 6. Build then branch: where we have reduced values in the master
3541 // thread in each team.
3542 // __kmpc_end_reduce{_nowait}(<gtid>);
3543 // break;
3544 emitBlock(ThenBB, CurFunc);
3545
3546 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3547 for (auto En : enumerate(ReductionInfos)) {
3548 const ReductionInfo &RI = En.value();
3549 Value *LHS = RI.Variable;
3550 Value *RHS =
3551 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3552
3553 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3554 Value *LHSPtr, *RHSPtr;
3555 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3556 &LHSPtr, &RHSPtr, CurFunc));
3557
3558 // Fix the callback code generated to use the correct Values for the LHS
3559 // and RHS
3560 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3561 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3562 ReductionFunc;
3563 });
3564 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3565 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3566 ReductionFunc;
3567 });
3568 } else {
3569 assert(false && "Unhandled ReductionGenCBKind");
3570 }
3571 }
3572 emitBlock(ExitBB, CurFunc);
3573
3574 Config.setEmitLLVMUsed();
3575
3576 return Builder.saveIP();
3577}
3578
3579 static Function *getFreshReductionFunc(Module &M) {
3580 Type *VoidTy = Type::getVoidTy(M.getContext());
3581 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3582 auto *FuncTy =
3583 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3584 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3585 ".omp.reduction.func", &M);
3586}
3587
3588 OpenMPIRBuilder::InsertPointTy
3589 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3590 InsertPointTy AllocaIP,
3591 ArrayRef<ReductionInfo> ReductionInfos,
3592 ArrayRef<bool> IsByRef, bool IsNoWait) {
3593 assert(ReductionInfos.size() == IsByRef.size());
3594 for (const ReductionInfo &RI : ReductionInfos) {
3595 (void)RI;
3596 assert(RI.Variable && "expected non-null variable");
3597 assert(RI.PrivateVariable && "expected non-null private variable");
3598 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3599 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3600 "expected variables and their private equivalents to have the same "
3601 "type");
3602 assert(RI.Variable->getType()->isPointerTy() &&
3603 "expected variables to be pointers");
3604 }
3605
3606 if (!updateToLocation(Loc))
3607 return InsertPointTy();
3608
3609 BasicBlock *InsertBlock = Loc.IP.getBlock();
3610 BasicBlock *ContinuationBlock =
3611 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3612 InsertBlock->getTerminator()->eraseFromParent();
3613
3614 // Create and populate array of type-erased pointers to private reduction
3615 // values.
3616 unsigned NumReductions = ReductionInfos.size();
3617 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3618 Builder.restoreIP(AllocaIP);
3619 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3620
3621 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3622
3623 for (auto En : enumerate(ReductionInfos)) {
3624 unsigned Index = En.index();
3625 const ReductionInfo &RI = En.value();
3626 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3627 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3628 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3629 }
3630
3631 // Emit a call to the runtime function that orchestrates the reduction.
3632 // Declare the reduction function in the process.
3633 Function *Func = Builder.GetInsertBlock()->getParent();
3634 Module *Module = Func->getParent();
3635 uint32_t SrcLocStrSize;
3636 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3637 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3638 return RI.AtomicReductionGen;
3639 });
3640 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3641 CanGenerateAtomic
3642 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3643 : IdentFlag(0));
3644 Value *ThreadId = getOrCreateThreadID(Ident);
3645 Constant *NumVariables = Builder.getInt32(NumReductions);
3646 const DataLayout &DL = Module->getDataLayout();
3647 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3648 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3649 Function *ReductionFunc = getFreshReductionFunc(*Module);
3650 Value *Lock = getOMPCriticalRegionLock(".reduction");
3651 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3652 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3653 : RuntimeFunction::OMPRTL___kmpc_reduce);
3654 CallInst *ReduceCall =
3655 Builder.CreateCall(ReduceFunc,
3656 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3657 ReductionFunc, Lock},
3658 "reduce");
3659
3660 // Create final reduction entry blocks for the atomic and non-atomic case.
3661 // Emit IR that dispatches control flow to one of the blocks based on the
3662 // reduction supporting the atomic mode.
3663 BasicBlock *NonAtomicRedBlock =
3664 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3665 BasicBlock *AtomicRedBlock =
3666 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3667 SwitchInst *Switch =
3668 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3669 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3670 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
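// [Editor's note] The dispatch emitted here is morally equivalent to the
// following pseudo-C, assuming the two-case protocol of __kmpc_reduce:
//
//   switch (__kmpc_reduce[_nowait](loc, tid, n, size, red_array, func, lock)) {
//   case 1:  /* non-atomic */ reduce and store;
//            __kmpc_end_reduce[_nowait](loc, tid, lock); break;
//   case 2:  /* atomic */ atomic reduce; break;
//   default: break;  // fall through to reduce.finalize
//   }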
3671
3672 // Populate the non-atomic reduction using the elementwise reduction function.
3673 // This loads the elements from the global and private variables and reduces
3674 // them before storing back the result to the global variable.
3675 Builder.SetInsertPoint(NonAtomicRedBlock);
3676 for (auto En : enumerate(ReductionInfos)) {
3677 const ReductionInfo &RI = En.value();
3678 Type *ValueType = RI.ElementType;
3679 // We have one less load for by-ref case because that load is now inside of
3680 // the reduction region
3681 Value *RedValue = nullptr;
3682 if (!IsByRef[En.index()]) {
3683 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3684 "red.value." + Twine(En.index()));
3685 }
3686 Value *PrivateRedValue =
3687 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3688 "red.private.value." + Twine(En.index()));
3689 Value *Reduced;
3690 if (IsByRef[En.index()]) {
3691 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
3692 PrivateRedValue, Reduced));
3693 } else {
3694 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
3695 PrivateRedValue, Reduced));
3696 }
3697 if (!Builder.GetInsertBlock())
3698 return InsertPointTy();
3699 // for by-ref case, the load is inside of the reduction region
3700 if (!IsByRef[En.index()])
3701 Builder.CreateStore(Reduced, RI.Variable);
3702 }
3703 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3704 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3705 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3706 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3707 Builder.CreateBr(ContinuationBlock);
3708
3709 // Populate the atomic reduction using the atomic elementwise reduction
3710 // function. There are no loads/stores here because they will be happening
3711 // inside the atomic elementwise reduction.
3712 Builder.SetInsertPoint(AtomicRedBlock);
3713 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3714 for (const ReductionInfo &RI : ReductionInfos) {
3715 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
3716 RI.Variable, RI.PrivateVariable));
3717 if (!Builder.GetInsertBlock())
3718 return InsertPointTy();
3719 }
3720 Builder.CreateBr(ContinuationBlock);
3721 } else {
3722 Builder.CreateUnreachable();
3723 }
3724
3725 // Populate the outlined reduction function using the elementwise reduction
3726 // function. Partial values are extracted from the type-erased array of
3727 // pointers to private variables.
3728 BasicBlock *ReductionFuncBlock =
3729 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3730 Builder.SetInsertPoint(ReductionFuncBlock);
3731 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3732 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3733
3734 for (auto En : enumerate(ReductionInfos)) {
3735 const ReductionInfo &RI = En.value();
3736 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3737 RedArrayTy, LHSArrayPtr, 0, En.index());
3738 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3739 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3740 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3741 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3742 RedArrayTy, RHSArrayPtr, 0, En.index());
3743 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3744 Value *RHSPtr =
3745 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3746 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3747 Value *Reduced;
3748 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
3749 if (!Builder.GetInsertBlock())
3750 return InsertPointTy();
3751 // store is inside of the reduction region when using by-ref
3752 if (!IsByRef[En.index()])
3753 Builder.CreateStore(Reduced, LHSPtr);
3754 }
3755 Builder.CreateRetVoid();
3756
3757 Builder.SetInsertPoint(ContinuationBlock);
3758 return Builder.saveIP();
3759}
3760
3761 OpenMPIRBuilder::InsertPointTy
3762 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3763 BodyGenCallbackTy BodyGenCB,
3764 FinalizeCallbackTy FiniCB) {
3765
3766 if (!updateToLocation(Loc))
3767 return Loc.IP;
3768
3769 Directive OMPD = Directive::OMPD_master;
3770 uint32_t SrcLocStrSize;
3771 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3772 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3773 Value *ThreadId = getOrCreateThreadID(Ident);
3774 Value *Args[] = {Ident, ThreadId};
3775
3776 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3777 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3778
3779 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3780 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3781
3782 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3783 /*Conditional*/ true, /*hasFinalize*/ true);
3784}
3785
3786 OpenMPIRBuilder::InsertPointTy
3787 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3788 BodyGenCallbackTy BodyGenCB,
3789 FinalizeCallbackTy FiniCB, Value *Filter) {
3790 if (!updateToLocation(Loc))
3791 return Loc.IP;
3792
3793 Directive OMPD = Directive::OMPD_masked;
3794 uint32_t SrcLocStrSize;
3795 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3796 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3797 Value *ThreadId = getOrCreateThreadID(Ident);
3798 Value *Args[] = {Ident, ThreadId, Filter};
3799 Value *ArgsEnd[] = {Ident, ThreadId};
3800
3801 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3802 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3803
3804 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3805 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3806
3807 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3808 /*Conditional*/ true, /*hasFinalize*/ true);
3809}
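// [Editor's note] Both createMaster and createMasked expand to a conditional
// inlined region; roughly, in pseudo-C:
//
//   if (__kmpc_masked(loc, tid, filter)) {   // __kmpc_master(loc, tid) above
//     <region body>
//     __kmpc_end_masked(loc, tid);           // __kmpc_end_master above
//   }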
3810
3811 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3812 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3813 BasicBlock *PostInsertBefore, const Twine &Name) {
3814 Module *M = F->getParent();
3815 LLVMContext &Ctx = M->getContext();
3816 Type *IndVarTy = TripCount->getType();
3817
3818 // Create the basic block structure.
3819 BasicBlock *Preheader =
3820 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3821 BasicBlock *Header =
3822 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3823 BasicBlock *Cond =
3824 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3825 BasicBlock *Body =
3826 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3827 BasicBlock *Latch =
3828 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3829 BasicBlock *Exit =
3830 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3831 BasicBlock *After =
3832 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3833
3834 // Use specified DebugLoc for new instructions.
3835 Builder.SetCurrentDebugLocation(DL);
3836
3837 Builder.SetInsertPoint(Preheader);
3838 Builder.CreateBr(Header);
3839
3840 Builder.SetInsertPoint(Header);
3841 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3842 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3843 Builder.CreateBr(Cond);
3844
3845 Builder.SetInsertPoint(Cond);
3846 Value *Cmp =
3847 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3848 Builder.CreateCondBr(Cmp, Body, Exit);
3849
3850 Builder.SetInsertPoint(Body);
3851 Builder.CreateBr(Latch);
3852
3853 Builder.SetInsertPoint(Latch);
3854 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3855 "omp_" + Name + ".next", /*HasNUW=*/true);
3856 Builder.CreateBr(Header);
3857 IndVarPHI->addIncoming(Next, Latch);
3858
3859 Builder.SetInsertPoint(Exit);
3860 Builder.CreateBr(After);
3861
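// [Editor's note] Schematically, the control flow wired up above is:
//
//   preheader -> header -> cond --(true)--> body -> latch -> header
//                             \--(false)--> exit -> after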
3862 // Remember and return the canonical control flow.
3863 LoopInfos.emplace_front();
3864 CanonicalLoopInfo *CL = &LoopInfos.front();
3865
3866 CL->Header = Header;
3867 CL->Cond = Cond;
3868 CL->Latch = Latch;
3869 CL->Exit = Exit;
3870
3871#ifndef NDEBUG
3872 CL->assertOK();
3873#endif
3874 return CL;
3875}
3876
3877 CanonicalLoopInfo *
3878 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3879 LoopBodyGenCallbackTy BodyGenCB,
3880 Value *TripCount, const Twine &Name) {
3881 BasicBlock *BB = Loc.IP.getBlock();
3882 BasicBlock *NextBB = BB->getNextNode();
3883
3884 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3885 NextBB, NextBB, Name);
3886 BasicBlock *After = CL->getAfter();
3887
3888 // If location is not set, don't connect the loop.
3889 if (updateToLocation(Loc)) {
3890 // Split the loop at the insertion point: Branch to the preheader and move
3891 // every following instruction to after the loop (the After BB). Also, the
3892 // new successor is the loop's after block.
3893 spliceBB(Builder, After, /*CreateBranch=*/false);
3894 Builder.CreateBr(CL->getPreheader());
3895 }
3896
3897 // Emit the body content. We do it after connecting the loop to the CFG so
3898 // that the callback does not encounter degenerate BBs.
3899 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
3900
3901#ifndef NDEBUG
3902 CL->assertOK();
3903#endif
3904 return CL;
3905}
3906
3907 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
3908 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3909 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3910 InsertPointTy ComputeIP, const Twine &Name) {
3911
3912 // Consider the following difficulties (assuming 8-bit signed integers):
3913 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3914 // DO I = 1, 100, 50
3915 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3916 // DO I = 100, 0, -128
3917
3918 // Start, Stop and Step must be of the same integer type.
3919 auto *IndVarTy = cast<IntegerType>(Start->getType());
3920 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3921 assert(IndVarTy == Step->getType() && "Step type mismatch");
3922
3923 LocationDescription ComputeLoc =
3924 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3925 updateToLocation(ComputeLoc);
3926
3927 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3928 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3929
3930 // Like Step, but always positive.
3931 Value *Incr = Step;
3932
3933 // Distance between Start and Stop; always positive.
3934 Value *Span;
3935
3936 // Condition checking whether no iterations are executed at all, e.g. because
3937 // UB < LB.
3938 Value *ZeroCmp;
3939
3940 if (IsSigned) {
3941 // Ensure that increment is positive. If not, negate and invert LB and UB.
3942 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3943 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3944 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3945 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3946 Span = Builder.CreateSub(UB, LB, "", false, true);
3947 ZeroCmp = Builder.CreateICmp(
3948 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3949 } else {
3950 Span = Builder.CreateSub(Stop, Start, "", true);
3951 ZeroCmp = Builder.CreateICmp(
3952 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3953 }
3954
3955 Value *CountIfLooping;
3956 if (InclusiveStop) {
3957 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3958 } else {
3959 // Avoid incrementing past stop since it could overflow.
3960 Value *CountIfTwo = Builder.CreateAdd(
3961 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3962 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3963 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3964 }
3965 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3966 "omp_" + Name + ".tripcount");
3967
3968 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3969 Builder.restoreIP(CodeGenIP);
3970 Value *Span = Builder.CreateMul(IV, Step);
3971 Value *IndVar = Builder.CreateAdd(Span, Start);
3972 BodyGenCB(Builder.saveIP(), IndVar);
3973 };
3974 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3975 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3976}
3977
3978// Returns an LLVM function to call for initializing loop bounds using OpenMP
3979// static scheduling depending on `type`. Only i32 and i64 are supported by the
3980// runtime. Always interpret integers as unsigned similarly to
3981// CanonicalLoopInfo.
3983 OpenMPIRBuilder &OMPBuilder) {
3984 unsigned Bitwidth = Ty->getIntegerBitWidth();
3985 if (Bitwidth == 32)
3986 return OMPBuilder.getOrCreateRuntimeFunction(
3987 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3988 if (Bitwidth == 64)
3989 return OMPBuilder.getOrCreateRuntimeFunction(
3990 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3991 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3992}
3993
3994 OpenMPIRBuilder::InsertPointTy
3995OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
3996 InsertPointTy AllocaIP,
3997 bool NeedsBarrier) {
3998 assert(CLI->isValid() && "Requires a valid canonical loop");
3999 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4000 "Require dedicated allocate IP");
4001
4002 // Set up the source location value for OpenMP runtime.
4003 Builder.restoreIP(CLI->getPreheaderIP());
4004 Builder.SetCurrentDebugLocation(DL);
4005
4006 uint32_t SrcLocStrSize;
4007 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4008 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4009
4010 // Declare useful OpenMP runtime functions.
4011 Value *IV = CLI->getIndVar();
4012 Type *IVTy = IV->getType();
4013 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4014 FunctionCallee StaticFini =
4015 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4016
4017 // Allocate space for computed loop bounds as expected by the "init" function.
4018 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4019
4020 Type *I32Type = Type::getInt32Ty(M.getContext());
4021 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4022 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4023 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4024 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4025
4026 // At the end of the preheader, prepare for calling the "init" function by
4027 // storing the current loop bounds into the allocated space. A canonical loop
4028 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4029 // and produces an inclusive upper bound.
4030 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4031 Constant *Zero = ConstantInt::get(IVTy, 0);
4032 Constant *One = ConstantInt::get(IVTy, 1);
4033 Builder.CreateStore(Zero, PLowerBound);
4034 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4035 Builder.CreateStore(UpperBound, PUpperBound);
4036 Builder.CreateStore(One, PStride);
4037
4038 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4039
4040 Constant *SchedulingType = ConstantInt::get(
4041 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4042
4043 // Call the "init" function and update the trip count of the loop with the
4044 // value it produced.
4045 Builder.CreateCall(StaticInit,
4046 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4047 PUpperBound, PStride, One, Zero});
4048 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4049 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4050 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4051 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4052 CLI->setTripCount(TripCount);
4053
4054 // Update all uses of the induction variable except the one in the condition
4055 // block that compares it with the actual upper bound, and the increment in
4056 // the latch block.
4057
4058 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4059 Builder.SetInsertPoint(CLI->getBody(),
4060 CLI->getBody()->getFirstInsertionPt());
4061 Builder.SetCurrentDebugLocation(DL);
4062 return Builder.CreateAdd(OldIV, LowerBound);
4063 });
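// [Editor's note] For instance, if the runtime hands this thread the
// inclusive bounds lb=25, ub=49, the rewritten loop runs 49 - 25 + 1 = 25
// iterations and the body sees OldIV + 25 as the logical induction value.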
4064
4065 // In the "exit" block, call the "fini" function.
4066 Builder.SetInsertPoint(CLI->getExit(),
4067 CLI->getExit()->getTerminator()->getIterator());
4068 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4069
4070 // Add the barrier if requested.
4071 if (NeedsBarrier)
4072 createBarrier(LocationDescription(Builder.saveIP(), DL),
4073 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4074 /* CheckCancelFlag */ false);
4075
4076 InsertPointTy AfterIP = CLI->getAfterIP();
4077 CLI->invalidate();
4078
4079 return AfterIP;
4080}
4081
4082OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4083 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4084 bool NeedsBarrier, Value *ChunkSize) {
4085 assert(CLI->isValid() && "Requires a valid canonical loop");
4086 assert(ChunkSize && "Chunk size is required");
4087
4088 LLVMContext &Ctx = CLI->getFunction()->getContext();
4089 Value *IV = CLI->getIndVar();
4090 Value *OrigTripCount = CLI->getTripCount();
4091 Type *IVTy = IV->getType();
4092 assert(IVTy->getIntegerBitWidth() <= 64 &&
4093 "Max supported tripcount bitwidth is 64 bits");
4094 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4095 : Type::getInt64Ty(Ctx);
4096 Type *I32Type = Type::getInt32Ty(M.getContext());
4097 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4098 Constant *One = ConstantInt::get(InternalIVTy, 1);
4099
4100 // Declare useful OpenMP runtime functions.
4101 FunctionCallee StaticInit =
4102 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4103 FunctionCallee StaticFini =
4104 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4105
4106 // Allocate space for computed loop bounds as expected by the "init" function.
4107 Builder.restoreIP(AllocaIP);
4108 Builder.SetCurrentDebugLocation(DL);
4109 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4110 Value *PLowerBound =
4111 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4112 Value *PUpperBound =
4113 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4114 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4115
4116 // Set up the source location value for the OpenMP runtime.
4117 Builder.restoreIP(CLI->getPreheaderIP());
4118 Builder.SetCurrentDebugLocation(DL);
4119
4120 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4121 Value *CastedChunkSize =
4122 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4123 Value *CastedTripCount =
4124 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4125
4126 Constant *SchedulingType = ConstantInt::get(
4127 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4128 Builder.CreateStore(Zero, PLowerBound);
4129 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4130 Builder.CreateStore(OrigUpperBound, PUpperBound);
4131 Builder.CreateStore(One, PStride);
4132
4133 // Call the "init" function and update the trip count of the loop with the
4134 // value it produced.
4135 uint32_t SrcLocStrSize;
4136 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4137 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4138 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4139 Builder.CreateCall(StaticInit,
4140 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4141 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4142 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4143 /*pstride=*/PStride, /*incr=*/One,
4144 /*chunk=*/CastedChunkSize});
4145
4146 // Load values written by the "init" function.
4147 Value *FirstChunkStart =
4148 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4149 Value *FirstChunkStop =
4150 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4151 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4152 Value *ChunkRange =
4153 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4154 Value *NextChunkStride =
4155 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4156
4157 // Create outer "dispatch" loop for enumerating the chunks.
4158 BasicBlock *DispatchEnter = splitBB(Builder, true);
4159 Value *DispatchCounter;
4160 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
4161 {Builder.saveIP(), DL},
4162 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
4163 FirstChunkStart, CastedTripCount, NextChunkStride,
4164 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4165 "dispatch");
4166
4167 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4168 // not have to preserve the canonical invariant.
4169 BasicBlock *DispatchBody = DispatchCLI->getBody();
4170 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4171 BasicBlock *DispatchExit = DispatchCLI->getExit();
4172 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4173 DispatchCLI->invalidate();
4174
4175 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4176 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4177 redirectTo(CLI->getExit(), DispatchLatch, DL);
4178 redirectTo(DispatchBody, DispatchEnter, DL);
4179
4180 // Prepare the prolog of the chunk loop.
4181 Builder.restoreIP(CLI->getPreheaderIP());
4182 Builder.SetCurrentDebugLocation(DL);
4183
4184 // Compute the number of iterations of the chunk loop.
4185 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4186 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4187 Value *IsLastChunk =
4188 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4189 Value *CountUntilOrigTripCount =
4190 Builder.CreateSub(CastedTripCount, DispatchCounter);
4191 Value *ChunkTripCount = Builder.CreateSelect(
4192 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4193 Value *BackcastedChunkTC =
4194 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4195 CLI->setTripCount(BackcastedChunkTC);
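// [Editor's note] A worked example, assuming trip count 10, chunk size 3 and
// two threads for which the runtime returns lb=0/ub=2 resp. lb=3/ub=5 with
// stride 6: thread 0 executes chunks {0-2, 6-8} and thread 1 executes
// {3-5, 9}; the select above clips the last chunk to 10 - 9 = 1 iteration.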
4196
4197 // Update all uses of the induction variable except the one in the condition
4198 // block that compares it with the actual upper bound, and the increment in
4199 // the latch block.
4200 Value *BackcastedDispatchCounter =
4201 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4202 CLI->mapIndVar([&](Instruction *) -> Value * {
4203 Builder.restoreIP(CLI->getBodyIP());
4204 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4205 });
4206
4207 // In the "exit" block, call the "fini" function.
4208 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4209 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4210
4211 // Add the barrier if requested.
4212 if (NeedsBarrier)
4213 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4214 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4215
4216#ifndef NDEBUG
4217 // Even though we currently do not support applying additional methods to it,
4218 // the chunk loop should remain a canonical loop.
4219 CLI->assertOK();
4220#endif
4221
4222 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
4223}
4224
4225// Returns an LLVM function to call for executing an OpenMP static worksharing
4226// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4227// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4228static FunctionCallee
4229 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4230 WorksharingLoopType LoopType) {
4231 unsigned Bitwidth = Ty->getIntegerBitWidth();
4232 Module &M = OMPBuilder->M;
4233 switch (LoopType) {
4234 case WorksharingLoopType::ForStaticLoop:
4235 if (Bitwidth == 32)
4236 return OMPBuilder->getOrCreateRuntimeFunction(
4237 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4238 if (Bitwidth == 64)
4239 return OMPBuilder->getOrCreateRuntimeFunction(
4240 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4241 break;
4242 case WorksharingLoopType::DistributeStaticLoop:
4243 if (Bitwidth == 32)
4244 return OMPBuilder->getOrCreateRuntimeFunction(
4245 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4246 if (Bitwidth == 64)
4247 return OMPBuilder->getOrCreateRuntimeFunction(
4248 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4249 break;
4250 case WorksharingLoopType::DistributeForStaticLoop:
4251 if (Bitwidth == 32)
4252 return OMPBuilder->getOrCreateRuntimeFunction(
4253 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4254 if (Bitwidth == 64)
4255 return OMPBuilder->getOrCreateRuntimeFunction(
4256 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4257 break;
4258 }
4259 if (Bitwidth != 32 && Bitwidth != 64) {
4260 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4261 }
4262 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4263}
4264
4265// Inserts a call to proper OpenMP Device RTL function which handles
4266// loop worksharing.
4268 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4269 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4270 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4271 Type *TripCountTy = TripCount->getType();
4272 Module &M = OMPBuilder->M;
4273 IRBuilder<> &Builder = OMPBuilder->Builder;
4274 FunctionCallee RTLFn =
4275 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4276 SmallVector<Value *, 8> RealArgs;
4277 RealArgs.push_back(Ident);
4278 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4279 RealArgs.push_back(LoopBodyArg);
4280 RealArgs.push_back(TripCount);
4281 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4282 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4283 Builder.CreateCall(RTLFn, RealArgs);
4284 return;
4285 }
4286 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4287 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4288 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4289 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4290
4291 RealArgs.push_back(
4292 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4293 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4294 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4295 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4296 }
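// [Editor's note] The resulting argument layouts are therefore, in order:
//   static 'for' loop:      ident, task fn, task arg, tripcount, nthreads, 0
//   'distribute' loop:      ident, task fn, task arg, tripcount, 0
//   'distribute for' loop:  ident, task fn, task arg, tripcount, nthreads, 0, 0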
4297
4298 Builder.CreateCall(RTLFn, RealArgs);
4299}
4300
4301static void
4302 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4303 CanonicalLoopInfo *CLI, Value *Ident,
4304 Function &OutlinedFn, Type *ParallelTaskPtr,
4305 const SmallVector<Instruction *, 4> &ToBeDeleted,
4306 WorksharingLoopType LoopType) {
4307 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4308 BasicBlock *Preheader = CLI->getPreheader();
4309 Value *TripCount = CLI->getTripCount();
4310
4311 // After loop body outlining, the loop body contains only the setup
4312 // of the loop body argument structure and the call to the outlined
4313 // loop body function. First, we need to move the setup of the loop body
4314 // args into the loop preheader.
4315 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4316 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4317
4318 // The next step is to remove the whole loop. We do not need it anymore.
4319 // That's why we make an unconditional branch from the loop preheader to
4320 // the loop exit block.
4321 Builder.restoreIP({Preheader, Preheader->end()});
4322 Preheader->getTerminator()->eraseFromParent();
4323 Builder.CreateBr(CLI->getExit());
4324
4325 // Delete dead loop blocks
4326 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4327 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4328 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4329 CleanUpInfo.EntryBB = CLI->getHeader();
4330 CleanUpInfo.ExitBB = CLI->getExit();
4331 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4332 DeleteDeadBlocks(BlocksToBeRemoved);
4333
4334 // Find the instruction which corresponds to the loop body argument
4335 // structure and remove the call to the loop body function.
4336 Value *LoopBodyArg;
4337 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4338 assert(OutlinedFnUser &&
4339 "Expected unique undroppable user of outlined function");
4340 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4341 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4342 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4343 "Expected outlined function call to be located in loop preheader");
4344 // Check in case no argument structure has been passed.
4345 if (OutlinedFnCallInstruction->arg_size() > 1)
4346 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4347 else
4348 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4349 OutlinedFnCallInstruction->eraseFromParent();
4350
4351 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4352 LoopBodyArg, ParallelTaskPtr, TripCount,
4353 OutlinedFn);
4354
4355 for (auto &ToBeDeletedItem : ToBeDeleted)
4356 ToBeDeletedItem->eraseFromParent();
4357 CLI->invalidate();
4358}
4359
4360 OpenMPIRBuilder::InsertPointTy
4361OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4362 InsertPointTy AllocaIP,
4363 WorksharingLoopType LoopType) {
4364 uint32_t SrcLocStrSize;
4365 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4366 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4367
4368 OutlineInfo OI;
4369 OI.OuterAllocaBB = CLI->getPreheader();
4370 Function *OuterFn = CLI->getPreheader()->getParent();
4371
4372 // Instructions which need to be deleted at the end of code generation
4373 SmallVector<Instruction *, 4> ToBeDeleted;
4374
4375 OI.OuterAllocaBB = AllocaIP.getBlock();
4376
4377 // Mark the loop body as the region which needs to be extracted
4378 OI.EntryBB = CLI->getBody();
4379 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4380 "omp.prelatch", true);
4381
4382 // Prepare loop body for extraction
4383 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4384
4385 // Insert new loop counter variable which will be used only in loop
4386 // body.
4387 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4388 Instruction *NewLoopCntLoad =
4389 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4390 // New loop counter instructions are redundant in the loop preheader when
4391 // code generation for the workshare loop is finished. That's why we mark
4392 // them as ready for deletion.
4393 ToBeDeleted.push_back(NewLoopCntLoad);
4394 ToBeDeleted.push_back(NewLoopCnt);
4395
4396 // Analyse loop body region. Find all input variables which are used inside
4397 // loop body region.
4398 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4399 SmallVector<BasicBlock *, 32> Blocks;
4400 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4401 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4402 ParallelRegionBlockSet.end());
4403
4404 CodeExtractorAnalysisCache CEAC(*OuterFn);
4405 CodeExtractor Extractor(Blocks,
4406 /* DominatorTree */ nullptr,
4407 /* AggregateArgs */ true,
4408 /* BlockFrequencyInfo */ nullptr,
4409 /* BranchProbabilityInfo */ nullptr,
4410 /* AssumptionCache */ nullptr,
4411 /* AllowVarArgs */ true,
4412 /* AllowAlloca */ true,
4413 /* AllocationBlock */ CLI->getPreheader(),
4414 /* Suffix */ ".omp_wsloop",
4415 /* AggrArgsIn0AddrSpace */ true);
4416
4417 BasicBlock *CommonExit = nullptr;
4418 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4419
4420 // Find allocas outside the loop body region which are used inside loop
4421 // body
4422 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4423
4424 // We need to model the loop body region as the function f(cnt, loop_arg).
4425 // That's why we replace the loop induction variable with the new counter,
4426 // which will be one of the loop body function arguments.
4427 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4428 CLI->getIndVar()->user_end());
4429 for (auto Use : Users) {
4430 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4431 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4432 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4433 }
4434 }
4435 }
4436 // Make sure that the loop counter variable is not merged into the loop
4437 // body function argument structure and is passed as a separate variable.
4438 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4439
4440 // The PostOutline CB is invoked when the loop body function has been
4441 // outlined and the loop body replaced by a call to the outlined function.
4442 // We need to add a call to the OpenMP device RTL inside the loop
4443 // preheader; the OpenMP device RTL function will handle the loop control
4444 // logic.
4445 OI.PostOutlineCB = [=, ToBeDeletedVec =
4446 std::move(ToBeDeleted)](Function &OutlinedFn) {
4447 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4448 ToBeDeletedVec, LoopType);
4449 };
4450 addOutlineInfo(std::move(OI));
4451 return CLI->getAfterIP();
4452}
4453
4454 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
4455 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4456 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4457 bool HasSimdModifier, bool HasMonotonicModifier,
4458 bool HasNonmonotonicModifier, bool HasOrderedClause,
4459 WorksharingLoopType LoopType) {
4460 if (Config.isTargetDevice())
4461 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4462 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4463 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4464 HasNonmonotonicModifier, HasOrderedClause);
4465
4466 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4467 OMPScheduleType::ModifierOrdered;
4468 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4469 case OMPScheduleType::BaseStatic:
4470 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4471 if (IsOrdered)
4472 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4473 NeedsBarrier, ChunkSize);
4474 // FIXME: Monotonicity ignored?
4475 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4476
4477 case OMPScheduleType::BaseStaticChunked:
4478 if (IsOrdered)
4479 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4480 NeedsBarrier, ChunkSize);
4481 // FIXME: Monotonicity ignored?
4482 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4483 ChunkSize);
4484
4485 case OMPScheduleType::BaseRuntime:
4486 case OMPScheduleType::BaseAuto:
4487 case OMPScheduleType::BaseGreedy:
4488 case OMPScheduleType::BaseBalanced:
4489 case OMPScheduleType::BaseSteal:
4490 case OMPScheduleType::BaseGuidedSimd:
4491 case OMPScheduleType::BaseRuntimeSimd:
4492 assert(!ChunkSize &&
4493 "schedule type does not support user-defined chunk sizes");
4494 [[fallthrough]];
4495 case OMPScheduleType::BaseDynamicChunked:
4496 case OMPScheduleType::BaseGuidedChunked:
4497 case OMPScheduleType::BaseGuidedIterativeChunked:
4498 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4499 case OMPScheduleType::BaseStaticBalancedChunked:
4500 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4501 NeedsBarrier, ChunkSize);
4502
4503 default:
4504 llvm_unreachable("Unknown/unimplemented schedule kind");
4505 }
4506}
4507
4508/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4509/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4510/// the runtime. Always interpret integers as unsigned similarly to
4511/// CanonicalLoopInfo.
4512static FunctionCallee
4513 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4514 unsigned Bitwidth = Ty->getIntegerBitWidth();
4515 if (Bitwidth == 32)
4516 return OMPBuilder.getOrCreateRuntimeFunction(
4517 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4518 if (Bitwidth == 64)
4519 return OMPBuilder.getOrCreateRuntimeFunction(
4520 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4521 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4522}
4523
4524/// Returns an LLVM function to call for updating the next loop using OpenMP
4525/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4526/// the runtime. Always interpret integers as unsigned similarly to
4527/// CanonicalLoopInfo.
4528static FunctionCallee
4529 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4530 unsigned Bitwidth = Ty->getIntegerBitWidth();
4531 if (Bitwidth == 32)
4532 return OMPBuilder.getOrCreateRuntimeFunction(
4533 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4534 if (Bitwidth == 64)
4535 return OMPBuilder.getOrCreateRuntimeFunction(
4536 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4537 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4538}
4539
4540/// Returns an LLVM function to call for finalizing the dynamic loop,
4541/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4542/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4543static FunctionCallee
4544getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4545 unsigned Bitwidth = Ty->getIntegerBitWidth();
4546 if (Bitwidth == 32)
4547 return OMPBuilder.getOrCreateRuntimeFunction(
4548 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4549 if (Bitwidth == 64)
4550 return OMPBuilder.getOrCreateRuntimeFunction(
4551 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4552 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4553}
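// Illustration: together, these three helpers select the runtime entry points
// of the dynamic dispatch protocol. For an i32 induction variable the emitted
// code behaves roughly like the following C-like sketch (not emitted
// verbatim; argument values depend on the caller):
//
//   __kmpc_dispatch_init_4u(loc, tid, sched, lb, ub, stride, chunk);
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &stride)) {
//     for (iv = lb; iv <= ub; ++iv) {   // runtime bounds are inclusive
//       body(iv);
//       // with an ordered clause: __kmpc_dispatch_fini_4u(loc, tid);
//     }
//   }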
4554
4555OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
4556 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4557 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
4558 assert(CLI->isValid() && "Requires a valid canonical loop");
4559 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4560 "Require dedicated allocate IP");
4562 "Require valid schedule type");
4563
4564 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4565 OMPScheduleType::ModifierOrdered;
4566
4567 // Set up the source location value for OpenMP runtime.
4568 Builder.SetCurrentDebugLocation(DL);
4569
4570 uint32_t SrcLocStrSize;
4571 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4572 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4573
4574 // Declare useful OpenMP runtime functions.
4575 Value *IV = CLI->getIndVar();
4576 Type *IVTy = IV->getType();
4577 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4578 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4579
4580 // Allocate space for computed loop bounds as expected by the "init" function.
4581 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4582 Type *I32Type = Type::getInt32Ty(M.getContext());
4583 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4584 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4585 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4586 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4587
4588 // At the end of the preheader, prepare for calling the "init" function by
4589 // storing the current loop bounds into the allocated space. A canonical loop
4590 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4591 // and produces an inclusive upper bound.
4592 BasicBlock *PreHeader = CLI->getPreheader();
4593 Builder.SetInsertPoint(PreHeader->getTerminator());
4594 Constant *One = ConstantInt::get(IVTy, 1);
4595 Builder.CreateStore(One, PLowerBound);
4596 Value *UpperBound = CLI->getTripCount();
4597 Builder.CreateStore(UpperBound, PUpperBound);
4598 Builder.CreateStore(One, PStride);
4599
4600 BasicBlock *Header = CLI->getHeader();
4601 BasicBlock *Exit = CLI->getExit();
4602 BasicBlock *Cond = CLI->getCond();
4603 BasicBlock *Latch = CLI->getLatch();
4604 InsertPointTy AfterIP = CLI->getAfterIP();
4605
4606 // The CLI will be "broken" in the code below, as the loop is no longer
4607 // a valid canonical loop.
4608
4609 if (!Chunk)
4610 Chunk = One;
4611
4612 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4613
4614 Constant *SchedulingType =
4615 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4616
4617 // Call the "init" function.
4618 Builder.CreateCall(DynamicInit,
4619 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4620 UpperBound, /* step */ One, Chunk});
4621
4622 // An outer loop around the existing one.
4623 BasicBlock *OuterCond = BasicBlock::Create(
4624 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4625 PreHeader->getParent());
4626 // The value returned by the dispatch_next call below is always 32-bit, so a
4627 // 32-bit zero is needed for the comparison rather than an IVTy constant.
4627 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4628 Value *Res =
4629 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4630 PLowerBound, PUpperBound, PStride});
4631 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4632 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4633 Value *LowerBound =
4634 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4635 Builder.CreateCondBr(MoreWork, Header, Exit);
4636
4637 // Change PHI-node in loop header to use outer cond rather than preheader,
4638 // and set IV to the LowerBound.
4639 Instruction *Phi = &Header->front();
4640 auto *PI = cast<PHINode>(Phi);
4641 PI->setIncomingBlock(0, OuterCond);
4642 PI->setIncomingValue(0, LowerBound);
4643
4644 // Then set the pre-header to jump to the OuterCond
4645 Instruction *Term = PreHeader->getTerminator();
4646 auto *Br = cast<BranchInst>(Term);
4647 Br->setSuccessor(0, OuterCond);
4648
4649 // Modify the inner condition:
4650 // * Use the UpperBound returned from the DynamicNext call.
4651 // * Jump to the outer loop's condition when done with one of the inner loops.
4652 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4653 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4654 Instruction *Comp = &*Builder.GetInsertPoint();
4655 auto *CI = cast<CmpInst>(Comp);
4656 CI->setOperand(1, UpperBound);
4657 // Redirect the inner exit to branch to outer condition.
4658 Instruction *Branch = &Cond->back();
4659 auto *BI = cast<BranchInst>(Branch);
4660 assert(BI->getSuccessor(1) == Exit);
4661 BI->setSuccessor(1, OuterCond);
4662
4663 // Call the "fini" function if "ordered" is present in wsloop directive.
4664 if (Ordered) {
4665 Builder.SetInsertPoint(&Latch->back());
4666 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4667 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4668 }
4669
4670 // Add the barrier if requested.
4671 if (NeedsBarrier) {
4672 Builder.SetInsertPoint(&Exit->back());
4673 createBarrier(LocationDescription(Builder.saveIP(), DL),
4674 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4675 /* CheckCancelFlag */ false);
4676 }
4677
4678 CLI->invalidate();
4679 return AfterIP;
4680}
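// After the rewiring above, control flow around the (now invalidated)
// canonical loop looks roughly like this (sketch; names follow the code):
//
//   preheader:  store lb/ub/stride; __kmpc_dispatch_init(...); br outer.cond
//   outer.cond: more = __kmpc_dispatch_next(..., p.lastiter, p.lowerbound,
//                                           p.upperbound, p.stride)
//               lb = load(p.lowerbound) - 1; br more, header, exit
//   cond:       br (iv < load(p.upperbound)), body, outer.cond
//   latch:      iv += 1   // plus __kmpc_dispatch_fini when ordered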
4681
4682/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4683/// after this \p OldTarget will be orphaned.
4684static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4685 BasicBlock *NewTarget, DebugLoc DL) {
4686 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4687 redirectTo(Pred, NewTarget, DL);
4688}
4689
4690/// Determine which blocks in \p BBs are reachable from outside and remove the
4691/// others from the function.
4692static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4693 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4694 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4695 for (Use &U : BB->uses()) {
4696 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4697 if (!UseInst)
4698 continue;
4699 if (BBsToErase.count(UseInst->getParent()))
4700 continue;
4701 return true;
4702 }
4703 return false;
4704 };
4705
4706 while (BBsToErase.remove_if(HasRemainingUses)) {
4707 // Try again if anything was removed.
4708 }
4709
4710 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4711 DeleteDeadBlocks(BBVec);
4712}
4713
4714CanonicalLoopInfo *
4715OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4716 InsertPointTy ComputeIP) {
4717 assert(Loops.size() >= 1 && "At least one loop required");
4718 size_t NumLoops = Loops.size();
4719
4720 // Nothing to do if there is already just one loop.
4721 if (NumLoops == 1)
4722 return Loops.front();
4723
4724 CanonicalLoopInfo *Outermost = Loops.front();
4725 CanonicalLoopInfo *Innermost = Loops.back();
4726 BasicBlock *OrigPreheader = Outermost->getPreheader();
4727 BasicBlock *OrigAfter = Outermost->getAfter();
4728 Function *F = OrigPreheader->getParent();
4729
4730 // Loop control blocks that may become orphaned later.
4731 SmallVector<BasicBlock *, 12> OldControlBBs;
4732 OldControlBBs.reserve(6 * Loops.size());
4733 for (CanonicalLoopInfo *Loop : Loops)
4734 Loop->collectControlBlocks(OldControlBBs);
4735
4736 // Set up the IRBuilder for inserting the trip count computation.
4737 Builder.SetCurrentDebugLocation(DL);
4738 if (ComputeIP.isSet())
4739 Builder.restoreIP(ComputeIP);
4740 else
4741 Builder.restoreIP(Outermost->getPreheaderIP());
4742
4743 // Derive the collapsed loop's trip count.
4744 // TODO: Find common/largest indvar type.
4745 Value *CollapsedTripCount = nullptr;
4746 for (CanonicalLoopInfo *L : Loops) {
4747 assert(L->isValid() &&
4748 "All loops to collapse must be valid canonical loops");
4749 Value *OrigTripCount = L->getTripCount();
4750 if (!CollapsedTripCount) {
4751 CollapsedTripCount = OrigTripCount;
4752 continue;
4753 }
4754
4755 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4756 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4757 {}, /*HasNUW=*/true);
4758 }
4759
4760 // Create the collapsed loop control flow.
4761 CanonicalLoopInfo *Result =
4762 createLoopSkeleton(DL, CollapsedTripCount, F,
4763 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4764
4765 // Build the collapsed loop body code.
4766 // Start with deriving the input loop induction variables from the collapsed
4767 // one, using a divmod scheme. To preserve the original loops' order, the
4768 // innermost loop uses the least significant bits.
4769 Builder.restoreIP(Result->getBodyIP());
4770
4771 Value *Leftover = Result->getIndVar();
4772 SmallVector<Value *> NewIndVars;
4773 NewIndVars.resize(NumLoops);
4774 for (int i = NumLoops - 1; i >= 1; --i) {
4775 Value *OrigTripCount = Loops[i]->getTripCount();
4776
4777 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4778 NewIndVars[i] = NewIndVar;
4779
4780 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4781 }
4782 // Outermost loop gets all the remaining bits.
4783 NewIndVars[0] = Leftover;
4784
4785 // Construct the loop body control flow.
4786 // We progressively construct the branch structure following the direction of
4787 // control flow: from the leading in-between code, through the loop nest body
4788 // and the trailing in-between code, to rejoining the collapsed loop's latch.
4789 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
4790 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4791 // its predecessors as sources.
4792 BasicBlock *ContinueBlock = Result->getBody();
4793 BasicBlock *ContinuePred = nullptr;
4794 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4795 BasicBlock *NextSrc) {
4796 if (ContinueBlock)
4797 redirectTo(ContinueBlock, Dest, DL);
4798 else
4799 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4800
4801 ContinueBlock = nullptr;
4802 ContinuePred = NextSrc;
4803 };
4804
4805 // The code before the nested loop of each level.
4806 // Because we are sinking it into the nest, it will be executed more often
4807 // than in the original loop. More sophisticated schemes could keep track of what
4808 // the in-between code is and instantiate it only once per thread.
4809 for (size_t i = 0; i < NumLoops - 1; ++i)
4810 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4811
4812 // Connect the loop nest body.
4813 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4814
4815 // The code after the nested loop at each level.
4816 for (size_t i = NumLoops - 1; i > 0; --i)
4817 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4818
4819 // Connect the finished loop to the collapsed loop latch.
4820 ContinueWith(Result->getLatch(), nullptr);
4821
4822 // Replace the input loops with the new collapsed loop.
4823 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4824 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4825
4826 // Replace the input loop indvars with the derived ones.
4827 for (size_t i = 0; i < NumLoops; ++i)
4828 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4829
4830 // Remove unused parts of the input loops.
4831 removeUnusedBlocksFromParent(OldControlBBs);
4832
4833 for (CanonicalLoopInfo *L : Loops)
4834 L->invalidate();
4835
4836#ifndef NDEBUG
4837 Result->assertOK();
4838#endif
4839 return Result;
4840}
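// Worked illustration (assumed trip counts TC0, TC1, TC2): the collapsed loop
// runs for TC0*TC1*TC2 iterations, and the divmod scheme above recovers the
// original induction variables from the collapsed variable cv as
//   iv2 = cv % TC2;
//   iv1 = (cv / TC2) % TC1;
//   iv0 = (cv / TC2) / TC1;
// so the innermost variable varies fastest, preserving the iteration order.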
4841
4842std::vector<CanonicalLoopInfo *>
4843OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4844 ArrayRef<Value *> TileSizes) {
4845 assert(TileSizes.size() == Loops.size() &&
4846 "Must pass as many tile sizes as there are loops");
4847 int NumLoops = Loops.size();
4848 assert(NumLoops >= 1 && "At least one loop to tile required");
4849
4850 CanonicalLoopInfo *OutermostLoop = Loops.front();
4851 CanonicalLoopInfo *InnermostLoop = Loops.back();
4852 Function *F = OutermostLoop->getBody()->getParent();
4853 BasicBlock *InnerEnter = InnermostLoop->getBody();
4854 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4855
4856 // Loop control blocks that may become orphaned later.
4857 SmallVector<BasicBlock *, 12> OldControlBBs;
4858 OldControlBBs.reserve(6 * Loops.size());
4859 for (CanonicalLoopInfo *Loop : Loops)
4860 Loop->collectControlBlocks(OldControlBBs);
4861
4862 // Collect original trip counts and induction variable to be accessible by
4863 // index. Also, the structure of the original loops is not preserved during
4864 // the construction of the tiled loops, so do it before we scavenge the BBs of
4865 // any original CanonicalLoopInfo.
4866 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4867 for (CanonicalLoopInfo *L : Loops) {
4868 assert(L->isValid() && "All input loops must be valid canonical loops");
4869 OrigTripCounts.push_back(L->getTripCount());
4870 OrigIndVars.push_back(L->getIndVar());
4871 }
4872
4873 // Collect the code between loop headers. These may contain SSA definitions
4874 // that are used in the loop nest body. To be usable within the innermost
4875 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4876 // these instructions may be executed more often than before the tiling.
4877 // TODO: It would be sufficient to only sink them into body of the
4878 // corresponding tile loop.
4879 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
4880 for (int i = 0; i < NumLoops - 1; ++i) {
4881 CanonicalLoopInfo *Surrounding = Loops[i];
4882 CanonicalLoopInfo *Nested = Loops[i + 1];
4883
4884 BasicBlock *EnterBB = Surrounding->getBody();
4885 BasicBlock *ExitBB = Nested->getHeader();
4886 InbetweenCode.emplace_back(EnterBB, ExitBB);
4887 }
4888
4889 // Compute the trip counts of the floor loops.
4890 Builder.SetCurrentDebugLocation(DL);
4891 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4892 SmallVector<Value *, 4> FloorCount, FloorRems;
4893 for (int i = 0; i < NumLoops; ++i) {
4894 Value *TileSize = TileSizes[i];
4895 Value *OrigTripCount = OrigTripCounts[i];
4896 Type *IVType = OrigTripCount->getType();
4897
4898 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
4899 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
4900
4901 // 0 if tripcount divides the tilesize, 1 otherwise.
4902 // 1 means we need an additional iteration for a partial tile.
4903 //
4904 // Unfortunately we cannot just use the roundup-formula
4905 // (tripcount + tilesize - 1)/tilesize
4906 // because the summation might overflow. We do not want to introduce undefined
4907 // behavior when the untiled loop nest did not.
4908 Value *FloorTripOverflow =
4909 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
4910
4911 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
4912 FloorTripCount =
4913 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
4914 "omp_floor" + Twine(i) + ".tripcount", true);
4915
4916 // Remember some values for later use.
4917 FloorCount.push_back(FloorTripCount);
4918 FloorRems.push_back(FloorTripRem);
4919 }
4920
4921 // Generate the new loop nest, from the outermost to the innermost.
4922 std::vector<CanonicalLoopInfo *> Result;
4923 Result.reserve(NumLoops * 2);
4924
4925 // The basic block of the surrounding loop that enters the newly generated
4926 // loop nest.
4927 BasicBlock *Enter = OutermostLoop->getPreheader();
4928
4929 // The basic block of the surrounding loop where the inner code should
4930 // continue.
4931 BasicBlock *Continue = OutermostLoop->getAfter();
4932
4933 // Where the next loop basic block should be inserted.
4934 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
4935
4936 auto EmbeddNewLoop =
4937 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
4938 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
4939 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
4940 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
4941 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
4942 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
4943
4944 // Setup the position where the next embedded loop connects to this loop.
4945 Enter = EmbeddedLoop->getBody();
4946 Continue = EmbeddedLoop->getLatch();
4947 OutroInsertBefore = EmbeddedLoop->getLatch();
4948 return EmbeddedLoop;
4949 };
4950
4951 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
4952 const Twine &NameBase) {
4953 for (auto P : enumerate(TripCounts)) {
4954 CanonicalLoopInfo *EmbeddedLoop =
4955 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
4956 Result.push_back(EmbeddedLoop);
4957 }
4958 };
4959
4960 EmbeddNewLoops(FloorCount, "floor");
4961
4962 // Within the innermost floor loop, emit the code that computes the tile
4963 // sizes.
4964 Builder.restoreIP(Result.back()->getBodyIP());
4965 SmallVector<Value *, 4> TileCounts;
4966 for (int i = 0; i < NumLoops; ++i) {
4967 CanonicalLoopInfo *FloorLoop = Result[i];
4968 Value *TileSize = TileSizes[i];
4969
4970 Value *FloorIsEpilogue =
4971 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
4972 Value *TileTripCount =
4973 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
4974
4975 TileCounts.push_back(TileTripCount);
4976 }
4977
4978 // Create the tile loops.
4979 EmbeddNewLoops(TileCounts, "tile");
4980
4981 // Insert the inbetween code into the body.
4982 BasicBlock *BodyEnter = Enter;
4983 BasicBlock *BodyEntered = nullptr;
4984 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
4985 BasicBlock *EnterBB = P.first;
4986 BasicBlock *ExitBB = P.second;
4987
4988 if (BodyEnter)
4989 redirectTo(BodyEnter, EnterBB, DL);
4990 else
4991 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
4992
4993 BodyEnter = nullptr;
4994 BodyEntered = ExitBB;
4995 }
4996
4997 // Append the original loop nest body into the generated loop nest body.
4998 if (BodyEnter)
4999 redirectTo(BodyEnter, InnerEnter, DL);
5000 else
5001 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5003
5004 // Replace the original induction variable with an induction variable computed
5005 // from the tile and floor induction variables.
5006 Builder.restoreIP(Result.back()->getBodyIP());
5007 for (int i = 0; i < NumLoops; ++i) {
5008 CanonicalLoopInfo *FloorLoop = Result[i];
5009 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5010 Value *OrigIndVar = OrigIndVars[i];
5011 Value *Size = TileSizes[i];
5012
5013 Value *Scale =
5014 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5015 Value *Shift =
5016 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5017 OrigIndVar->replaceAllUsesWith(Shift);
5018 }
5019
5020 // Remove unused parts of the original loops.
5021 removeUnusedBlocksFromParent(OldControlBBs);
5022
5023 for (CanonicalLoopInfo *L : Loops)
5024 L->invalidate();
5025
5026#ifndef NDEBUG
5027 for (CanonicalLoopInfo *GenL : Result)
5028 GenL->assertOK();
5029#endif
5030 return Result;
5031}
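// Worked illustration (assumed numbers): tiling a loop of trip count 10 with
// tile size 4 yields a floor loop of trip count 10/4 + (10%4 != 0) = 3. Full
// tiles execute 4 iterations, the final partial tile executes 10%4 = 2, and
// the original induction variable is recomputed inside the tile loop as
//   iv = floor_iv * 4 + tile_iv;
// which is exactly the Scale/Shift computation emitted above.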
5032
5033/// Attach metadata \p Properties to the basic block described by \p BB. If the
5034/// basic block already has metadata, the basic block properties are appended.
5035static void addBasicBlockMetadata(BasicBlock *BB,
5036 ArrayRef<Metadata *> Properties) {
5037 // Nothing to do if no property to attach.
5038 if (Properties.empty())
5039 return;
5040
5041 LLVMContext &Ctx = BB->getContext();
5042 SmallVector<Metadata *> NewProperties;
5043 NewProperties.push_back(nullptr);
5044
5045 // If the basic block already has metadata, prepend it to the new metadata.
5046 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5047 if (Existing)
5048 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5049
5050 append_range(NewProperties, Properties);
5051 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5052 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5053
5054 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5055}
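// In textual IR, the distinct node built above takes the usual
// self-referential loop-metadata shape (sketch):
//
//   br label %latch, !llvm.loop !0
//   !0 = distinct !{!0, !1}   ; operand 0 points back at the node itself
//   !1 = !{!"llvm.loop.unroll.enable"}
//
// The replaceOperandWith(0, BasicBlockID) call is what creates the self-cycle.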
5056
5057/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5058/// loop already has metadata, the loop properties are appended.
5059static void addLoopMetadata(CanonicalLoopInfo *Loop,
5060 ArrayRef<Metadata *> Properties) {
5061 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5062
5063 // Attach metadata to the loop's latch
5064 BasicBlock *Latch = Loop->getLatch();
5065 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5066 addBasicBlockMetadata(Latch, Properties);
5067}
5068
5069/// Attach llvm.access.group metadata to the memref instructions of \p Block
5070static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5071 LoopInfo &LI) {
5072 for (Instruction &I : *Block) {
5073 if (I.mayReadOrWriteMemory()) {
5074 // TODO: This instruction may already have access group from
5075 // other pragmas e.g. #pragma clang loop vectorize. Append
5076 // so that the existing metadata is not overwritten.
5077 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5078 }
5079 }
5080}
5081
5082void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5083 LLVMContext &Ctx = Builder.getContext();
5084 addLoopMetadata(
5085 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5086 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5087}
5088
5089void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5090 LLVMContext &Ctx = Builder.getContext();
5091 addLoopMetadata(
5092 Loop, {
5093 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5094 });
5095}
5096
5097void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5098 Value *IfCond, ValueToValueMapTy &VMap,
5099 const Twine &NamePrefix) {
5100 Function *F = CanonicalLoop->getFunction();
5101
5102 // Define where if branch should be inserted
5103 Instruction *SplitBefore;
5104 if (Instruction::classof(IfCond)) {
5105 SplitBefore = dyn_cast<Instruction>(IfCond);
5106 } else {
5107 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5108 }
5109
5110 // TODO: We should not rely on pass manager. Currently we use pass manager
5111 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5112 // object. We should have a method which returns all blocks between
5113 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5114 FunctionAnalysisManager FAM;
5115 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5116 FAM.registerPass([]() { return LoopAnalysis(); });
5117 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5118
5119 // Get the loop which needs to be cloned
5120 LoopAnalysis LIA;
5121 LoopInfo &&LI = LIA.run(*F, FAM);
5122 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5123
5124 // Create additional blocks for the if statement
5125 BasicBlock *Head = SplitBefore->getParent();
5126 Instruction *HeadOldTerm = Head->getTerminator();
5127 llvm::LLVMContext &C = Head->getContext();
5128 BasicBlock *ThenBlock = BasicBlock::Create(
5129 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5131 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5132
5133 // Create if condition branch.
5134 Builder.SetInsertPoint(HeadOldTerm);
5135 Instruction *BrInstr =
5136 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5137 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5138 // Then block contains branch to omp loop which needs to be vectorized
5139 spliceBB(IP, ThenBlock, false);
5140 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5141
5142 Builder.SetInsertPoint(ElseBlock);
5143
5144 // Clone loop for the else branch
5145 SmallVector<BasicBlock *, 8> NewBlocks;
5146
5147 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5148 for (BasicBlock *Block : L->getBlocks()) {
5149 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5150 NewBB->moveBefore(CanonicalLoop->getExit());
5151 VMap[Block] = NewBB;
5152 NewBlocks.push_back(NewBB);
5153 }
5154 remapInstructionsInBlocks(NewBlocks, VMap);
5155 Builder.CreateBr(NewBlocks.front());
5156}
5157
5158unsigned
5159OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5160 const StringMap<bool> &Features) {
5161 if (TargetTriple.isX86()) {
5162 if (Features.lookup("avx512f"))
5163 return 512;
5164 else if (Features.lookup("avx"))
5165 return 256;
5166 return 128;
5167 }
5168 if (TargetTriple.isPPC())
5169 return 128;
5170 if (TargetTriple.isWasm())
5171 return 128;
5172 return 0;
5173}
5174
5175void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5176 MapVector<Value *, Value *> AlignedVars,
5177 Value *IfCond, OrderKind Order,
5178 ConstantInt *Simdlen, ConstantInt *Safelen) {
5179 LLVMContext &Ctx = Builder.getContext();
5180
5181 Function *F = CanonicalLoop->getFunction();
5182
5183 // TODO: We should not rely on pass manager. Currently we use pass manager
5184 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5185 // object. We should have a method which returns all blocks between
5186 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5188 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5189 FAM.registerPass([]() { return LoopAnalysis(); });
5190 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5191
5192 LoopAnalysis LIA;
5193 LoopInfo &&LI = LIA.run(*F, FAM);
5194
5195 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5196 if (AlignedVars.size()) {
5197 InsertPointTy IP = Builder.saveIP();
5198 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
5199 for (auto &AlignedItem : AlignedVars) {
5200 Value *AlignedPtr = AlignedItem.first;
5201 Value *Alignment = AlignedItem.second;
5202 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5203 AlignedPtr, Alignment);
5204 }
5205 Builder.restoreIP(IP);
5206 }
5207
5208 if (IfCond) {
5209 ValueToValueMapTy VMap;
5210 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5211 // Add metadata to the cloned loop which disables vectorization
5212 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5213 assert(MappedLatch &&
5214 "Cannot find value which corresponds to original loop latch");
5215 assert(isa<BasicBlock>(MappedLatch) &&
5216 "Cannot cast mapped latch block value to BasicBlock");
5217 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5218 ConstantAsMetadata *BoolConst =
5219 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5220 addBasicBlockMetadata(
5221 NewLatchBlock,
5222 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5223 BoolConst})});
5224 }
5225
5226 SmallSet<BasicBlock *, 8> Reachable;
5227
5228 // Get the basic blocks from the loop in which memref instructions
5229 // can be found.
5230 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5231 // preferably without running any passes.
5232 for (BasicBlock *Block : L->getBlocks()) {
5233 if (Block == CanonicalLoop->getCond() ||
5234 Block == CanonicalLoop->getHeader())
5235 continue;
5236 Reachable.insert(Block);
5237 }
5238
5239 SmallVector<Metadata *> LoopMDList;
5240
5241 // In presence of finite 'safelen', it may be unsafe to mark all
5242 // the memory instructions parallel, because loop-carried
5243 // dependences of 'safelen' iterations are possible.
5244 // If clause order(concurrent) is specified then the memory instructions
5245 // are marked parallel even if 'safelen' is finite.
5246 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5247 // Add access group metadata to memory-access instructions.
5248 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5249 for (BasicBlock *BB : Reachable)
5250 addSimdMetadata(BB, AccessGroup, LI);
5251 // TODO: If the loop has existing parallel access metadata, have
5252 // to combine two lists.
5253 LoopMDList.push_back(MDNode::get(
5254 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5255 }
5256
5257 // Use the above access group metadata to create loop level
5258 // metadata, which should be distinct for each loop.
5259 ConstantAsMetadata *BoolConst =
5260 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5261 LoopMDList.push_back(MDNode::get(
5262 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5263
5264 if (Simdlen || Safelen) {
5265 // If both simdlen and safelen clauses are specified, the value of the
5266 // simdlen parameter must be less than or equal to the value of the safelen
5267 // parameter. Therefore, use safelen only in the absence of simdlen.
5268 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5269 LoopMDList.push_back(
5270 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5271 ConstantAsMetadata::get(VectorizeWidth)}));
5272 }
5273
5274 addLoopMetadata(CanonicalLoop, LoopMDList);
5275}
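// Illustration (assumed clauses): for "#pragma omp simd simdlen(8)" with no
// safelen and no if-clause, the loop ends up with metadata roughly like
//   !{!"llvm.loop.parallel_accesses", !ag}
//   !{!"llvm.loop.vectorize.enable", i1 true}
//   !{!"llvm.loop.vectorize.width", i32 8}
// where !ag is the distinct access group attached to each memory instruction
// in the loop body above.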
5276
5277/// Create the TargetMachine object to query the backend for optimization
5278/// preferences.
5279///
5280/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5281/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5282/// needed for the LLVM pass pipeline. We use some default options to avoid
5283/// having to pass too many settings from the frontend that probably do not
5284/// matter.
5285///
5286/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5287/// method. If we are going to use TargetMachine for more purposes, especially
5288/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5289/// might become worth requiring front-ends to pass on their TargetMachine,
5290/// or at least cache it between methods. Note that while frontends such as Clang
5291/// have just a single main TargetMachine per translation unit, "target-cpu" and
5292/// "target-features" that determine the TargetMachine are per-function and can
5293/// be overridden using __attribute__((target("OPTIONS"))).
5294static std::unique_ptr<TargetMachine>
5295createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5296 Module *M = F->getParent();
5297
5298 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5299 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5300 const std::string &Triple = M->getTargetTriple();
5301
5302 std::string Error;
5303 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5304 if (!TheTarget)
5305 return {};
5306
5307 TargetOptions Options;
5308 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5309 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5310 /*CodeModel=*/std::nullopt, OptLevel));
5311}
5312
5313/// Heuristically determine the best-performant unroll factor for \p CLI. This
5314/// depends on the target processor. We are re-using the same heuristics as the
5315/// LoopUnrollPass.
5316static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5317 Function *F = CLI->getFunction();
5318
5319 // Assume the user requests the most aggressive unrolling, even if the rest of
5320 // the code is optimized using a lower setting.
5321 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5322 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5323
5324 FunctionAnalysisManager FAM;
5325 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5326 FAM.registerPass([]() { return AssumptionAnalysis(); });
5327 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5328 FAM.registerPass([]() { return LoopAnalysis(); });
5329 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5330 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5331 TargetIRAnalysis TIRA;
5332 if (TM)
5333 TIRA = TargetIRAnalysis(
5334 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5335 FAM.registerPass([&]() { return TIRA; });
5336
5337 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5338 ScalarEvolutionAnalysis SEA;
5339 ScalarEvolution &&SE = SEA.run(*F, FAM);
5340 DominatorTreeAnalysis DTA;
5341 DominatorTree &&DT = DTA.run(*F, FAM);
5342 LoopAnalysis LIA;
5343 LoopInfo &&LI = LIA.run(*F, FAM);
5344 AssumptionAnalysis ACT;
5345 AssumptionCache &&AC = ACT.run(*F, FAM);
5346 OptimizationRemarkEmitter ORE{F};
5347
5348 Loop *L = LI.getLoopFor(CLI->getHeader());
5349 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5350
5351 TargetTransformInfo::UnrollingPreferences UP =
5352 gatherUnrollingPreferences(L, SE, TTI,
5353 /*BlockFrequencyInfo=*/nullptr,
5354 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5355 /*UserThreshold=*/std::nullopt,
5356 /*UserCount=*/std::nullopt,
5357 /*UserAllowPartial=*/true,
5358 /*UserAllowRuntime=*/true,
5359 /*UserUpperBound=*/std::nullopt,
5360 /*UserFullUnrollMaxCount=*/std::nullopt);
5361
5362 UP.Force = true;
5363
5364 // Account for additional optimizations taking place before the LoopUnrollPass
5365 // would unroll the loop.
5366 UP.Threshold *= UnrollThresholdFactor;
5367 UP.PartialThreshold *= UnrollThresholdFactor;
5368
5369 // Use normal unroll factors even if the rest of the code is optimized for
5370 // size.
5371 UP.OptSizeThreshold = UP.Threshold;
5372 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5373
5374 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5375 << " Threshold=" << UP.Threshold << "\n"
5376 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5377 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5378 << " PartialOptSizeThreshold="
5379 << UP.PartialOptSizeThreshold << "\n");
5380
5381 // Disable peeling.
5382 TargetTransformInfo::PeelingPreferences PP =
5383 gatherPeelingPreferences(L, SE, TTI,
5384 /*UserAllowPeeling=*/false,
5385 /*UserAllowProfileBasedPeeling=*/false,
5386 /*UnrollingSpecficValues=*/false);
5387
5388 SmallPtrSet<const Value *, 32> EphValues;
5389 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5390
5391 // Assume that reads and writes to stack variables can be eliminated by
5392 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5393 // size.
5394 for (BasicBlock *BB : L->blocks()) {
5395 for (Instruction &I : *BB) {
5396 Value *Ptr;
5397 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5398 Ptr = Load->getPointerOperand();
5399 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5400 Ptr = Store->getPointerOperand();
5401 } else
5402 continue;
5403
5404 Ptr = Ptr->stripPointerCasts();
5405
5406 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5407 if (Alloca->getParent() == &F->getEntryBlock())
5408 EphValues.insert(&I);
5409 }
5410 }
5411 }
5412
5413 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5414
5415 // Loop is not unrollable if the loop contains certain instructions.
5416 if (!UCE.canUnroll()) {
5417 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5418 return 1;
5419 }
5420
5421 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5422 << "\n");
5423
5424 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5425 // be able to use it.
5426 int TripCount = 0;
5427 int MaxTripCount = 0;
5428 bool MaxOrZero = false;
5429 unsigned TripMultiple = 0;
5430
5431 bool UseUpperBound = false;
5432 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5433 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5434 UseUpperBound);
5435 unsigned Factor = UP.Count;
5436 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5437
5438 // This function returns 1 to signal to not unroll a loop.
5439 if (Factor == 0)
5440 return 1;
5441 return Factor;
5442}
5443
5444void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5445 int32_t Factor,
5446 CanonicalLoopInfo **UnrolledCLI) {
5447 assert(Factor >= 0 && "Unroll factor must not be negative");
5448
5449 Function *F = Loop->getFunction();
5450 LLVMContext &Ctx = F->getContext();
5451
5452 // If the unrolled loop is not used for another loop-associated directive, it
5453 // is sufficient to add metadata for the LoopUnrollPass.
5454 if (!UnrolledCLI) {
5455 SmallVector<Metadata *, 2> LoopMetadata;
5456 LoopMetadata.push_back(
5457 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5458
5459 if (Factor >= 1) {
5460 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5461 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5462 LoopMetadata.push_back(MDNode::get(
5463 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5464 }
5465
5466 addLoopMetadata(Loop, LoopMetadata);
5467 return;
5468 }
5469
5470 // Heuristically determine the unroll factor.
5471 if (Factor == 0)
5472 Factor = computeHeuristicUnrollFactor(Loop);
5473
5474 // No change required with unroll factor 1.
5475 if (Factor == 1) {
5476 *UnrolledCLI = Loop;
5477 return;
5478 }
5479
5480 assert(Factor >= 2 &&
5481 "unrolling only makes sense with a factor of 2 or larger");
5482
5483 Type *IndVarTy = Loop->getIndVarType();
5484
5485 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5486 // unroll the inner loop.
5487 Value *FactorVal =
5488 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5489 /*isSigned=*/false));
5490 std::vector<CanonicalLoopInfo *> LoopNest =
5491 tileLoops(DL, {Loop}, {FactorVal});
5492 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5493 *UnrolledCLI = LoopNest[0];
5494 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5495
5496 // LoopUnrollPass can only fully unroll loops with constant trip count.
5497 // Unroll by the unroll factor with a fallback epilog for the remainder
5498 // iterations if necessary.
5499 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5500 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5501 addLoopMetadata(
5502 InnerLoop,
5503 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5505 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5506
5507#ifndef NDEBUG
5508 (*UnrolledCLI)->assertOK();
5509#endif
5510}
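// Illustration: a call such as unrollLoopPartial(DL, CLI, /*Factor=*/4, &Out)
// tiles the loop by 4, returns the outer "floor" loop through Out, and tags
// the inner tile loop with
//   llvm.loop.unroll.enable + llvm.loop.unroll.count = 4
// so that the LoopUnrollPass flattens it later; the partial tile produced by
// tileLoops acts as the epilogue for trip counts not divisible by 4.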
5511
5512OpenMPIRBuilder::InsertPointTy
5513OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5514 llvm::Value *BufSize, llvm::Value *CpyBuf,
5515 llvm::Value *CpyFn, llvm::Value *DidIt) {
5516 if (!updateToLocation(Loc))
5517 return Loc.IP;
5518
5519 uint32_t SrcLocStrSize;
5520 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5521 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5522 Value *ThreadId = getOrCreateThreadID(Ident);
5523
5524 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5525
5526 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5527
5528 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5529 Builder.CreateCall(Fn, Args);
5530
5531 return Builder.saveIP();
5532}
5533
5534OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
5535 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5536 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5537 ArrayRef<llvm::Function *> CPFuncs) {
5538
5539 if (!updateToLocation(Loc))
5540 return Loc.IP;
5541
5542 // If needed allocate and initialize `DidIt` with 0.
5543 // DidIt: flag variable: 1=single thread; 0=not single thread.
5544 llvm::Value *DidIt = nullptr;
5545 if (!CPVars.empty()) {
5546 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5547 Builder.CreateStore(Builder.getInt32(0), DidIt);
5548 }
5549
5550 Directive OMPD = Directive::OMPD_single;
5551 uint32_t SrcLocStrSize;
5552 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5553 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5554 Value *ThreadId = getOrCreateThreadID(Ident);
5555 Value *Args[] = {Ident, ThreadId};
5556
5557 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5558 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5559
5560 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5561 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5562
5563 auto FiniCBWrapper = [&](InsertPointTy IP) {
5564 FiniCB(IP);
5565
5566 // The thread that executes the single region must set `DidIt` to 1.
5567 // This is used by __kmpc_copyprivate, to know if the caller is the
5568 // single thread or not.
5569 if (DidIt)
5570 Builder.CreateStore(Builder.getInt32(1), DidIt);
5571 };
5572
5573 // generates the following:
5574 // if (__kmpc_single()) {
5575 // .... single region ...
5576 // __kmpc_end_single
5577 // }
5578 // __kmpc_copyprivate
5579 // __kmpc_barrier
5580
5581 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5582 /*Conditional*/ true,
5583 /*hasFinalize*/ true);
5584
5585 if (DidIt) {
5586 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5587 // NOTE BufSize is currently unused, so just pass 0.
5588 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5589 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5590 CPFuncs[I], DidIt);
5591 // NOTE __kmpc_copyprivate already inserts a barrier
5592 } else if (!IsNowait)
5593 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5594 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5595 /* CheckCancelFlag */ false);
5596 return Builder.saveIP();
5597}
5598
5599OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
5600 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5601 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5602
5603 if (!updateToLocation(Loc))
5604 return Loc.IP;
5605
5606 Directive OMPD = Directive::OMPD_critical;
5607 uint32_t SrcLocStrSize;
5608 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5609 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5610 Value *ThreadId = getOrCreateThreadID(Ident);
5611 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5612 Value *Args[] = {Ident, ThreadId, LockVar};
5613
5614 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5615 Function *RTFn = nullptr;
5616 if (HintInst) {
5617 // Add Hint to entry Args and create call
5618 EnterArgs.push_back(HintInst);
5619 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5620 } else {
5621 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5622 }
5623 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5624
5625 Function *ExitRTLFn =
5626 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5627 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5628
5629 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5630 /*Conditional*/ false, /*hasFinalize*/ true);
5631}
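// Illustration: the inlined region produced here is equivalent to the sketch
//   __kmpc_critical(&loc, tid, &lock);        // or __kmpc_critical_with_hint
//   ... region body ...
//   __kmpc_end_critical(&loc, tid, &lock);
// where "lock" is the module-level variable returned by
// getOMPCriticalRegionLock(CriticalName).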
5632
5633OpenMPIRBuilder::InsertPointTy
5634OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5635 InsertPointTy AllocaIP, unsigned NumLoops,
5636 ArrayRef<llvm::Value *> StoreValues,
5637 const Twine &Name, bool IsDependSource) {
5638 assert(
5639 llvm::all_of(StoreValues,
5640 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5641 "OpenMP runtime requires depend vec with i64 type");
5642
5643 if (!updateToLocation(Loc))
5644 return Loc.IP;
5645
5646 // Allocate space for vector and generate alloc instruction.
5647 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5648 Builder.restoreIP(AllocaIP);
5649 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5650 ArgsBase->setAlignment(Align(8));
5651 Builder.restoreIP(Loc.IP);
5652
5653 // Store the index value with offset in depend vector.
5654 for (unsigned I = 0; I < NumLoops; ++I) {
5655 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5656 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5657 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5658 STInst->setAlignment(Align(8));
5659 }
5660
5661 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5662 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5663
5664 uint32_t SrcLocStrSize;
5665 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5666 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5667 Value *ThreadId = getOrCreateThreadID(Ident);
5668 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5669
5670 Function *RTLFn = nullptr;
5671 if (IsDependSource)
5672 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5673 else
5674 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5675 Builder.CreateCall(RTLFn, Args);
5676
5677 return Builder.saveIP();
5678}
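// Illustration (assuming a doacross nest with two loop indices i and j): the
// code above materializes roughly
//   %vec = alloca [2 x i64], align 8
//   store i64 %i, %vec[0] ; store i64 %j, %vec[1]
//   call @__kmpc_doacross_post(&loc, tid, %vec)
// for "depend(source)", and calls @__kmpc_doacross_wait instead for the
// "depend(sink: ...)" variant.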
5679
5680OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
5681 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5682 FinalizeCallbackTy FiniCB, bool IsThreads) {
5683 if (!updateToLocation(Loc))
5684 return Loc.IP;
5685
5686 Directive OMPD = Directive::OMPD_ordered;
5687 Instruction *EntryCall = nullptr;
5688 Instruction *ExitCall = nullptr;
5689
5690 if (IsThreads) {
5691 uint32_t SrcLocStrSize;
5692 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5693 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5694 Value *ThreadId = getOrCreateThreadID(Ident);
5695 Value *Args[] = {Ident, ThreadId};
5696
5697 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5698 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5699
5700 Function *ExitRTLFn =
5701 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5702 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5703 }
5704
5705 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5706 /*Conditional*/ false, /*hasFinalize*/ true);
5707}
5708
5709OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5710 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5711 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5712 bool HasFinalize, bool IsCancellable) {
5713
5714 if (HasFinalize)
5715 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5716
5717 // Create inlined region's entry and body blocks, in preparation
5718 // for conditional creation
5719 BasicBlock *EntryBB = Builder.GetInsertBlock();
5720 Instruction *SplitPos = EntryBB->getTerminator();
5721 if (!isa_and_nonnull<BranchInst>(SplitPos))
5722 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5723 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5724 BasicBlock *FiniBB =
5725 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5726
5727 Builder.SetInsertPoint(EntryBB->getTerminator());
5728 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5729
5730 // generate body
5731 BodyGenCB(/* AllocaIP */ InsertPointTy(),
5732 /* CodeGenIP */ Builder.saveIP());
5733
5734 // emit exit call and do any needed finalization.
5735 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5736 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5737 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5738 "Unexpected control flow graph state!!");
5739 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5740 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5741 "Unexpected Control Flow State!");
5743
5744 // If we are skipping the region of a non-conditional, remove the exit
5745 // block, and clear the builder's insertion point.
5746 assert(SplitPos->getParent() == ExitBB &&
5747 "Unexpected Insertion point location!");
5748 auto merged = MergeBlockIntoPredecessor(ExitBB);
5749 BasicBlock *ExitPredBB = SplitPos->getParent();
5750 auto InsertBB = merged ? ExitPredBB : ExitBB;
5751 if (!isa_and_nonnull<BranchInst>(SplitPos))
5752 SplitPos->eraseFromParent();
5753 Builder.SetInsertPoint(InsertBB);
5754
5755 return Builder.saveIP();
5756}
5757
5758OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5759 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5760 // If there is nothing to do, return the current insertion point.
5761 if (!Conditional || !EntryCall)
5762 return Builder.saveIP();
5763
5764 BasicBlock *EntryBB = Builder.GetInsertBlock();
5765 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5766 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5767 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5768
5769 // Emit thenBB and set the Builder's insertion point there for
5770 // body generation next. Place the block after the current block.
5771 Function *CurFn = EntryBB->getParent();
5772 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5773
5774 // Move Entry branch to end of ThenBB, and replace with conditional
5775 // branch (If-stmt)
5776 Instruction *EntryBBTI = EntryBB->getTerminator();
5777 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5778 EntryBBTI->removeFromParent();
5779 Builder.SetInsertPoint(UI);
5780 Builder.Insert(EntryBBTI);
5781 UI->eraseFromParent();
5782 Builder.SetInsertPoint(ThenBB->getTerminator());
5783
5784 // return an insertion point to ExitBB.
5785 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5786}
5787
5788OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
5789 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5790 bool HasFinalize) {
5791
5792 Builder.restoreIP(FinIP);
5793
5794 // If there is finalization to do, emit it before the exit call
5795 if (HasFinalize) {
5796 assert(!FinalizationStack.empty() &&
5797 "Unexpected finalization stack state!");
5798
5799 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5800 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5801
5802 Fi.FiniCB(FinIP);
5803
5804 BasicBlock *FiniBB = FinIP.getBlock();
5805 Instruction *FiniBBTI = FiniBB->getTerminator();
5806
5807 // set Builder IP for call creation
5808 Builder.SetInsertPoint(FiniBBTI);
5809 }
5810
5811 if (!ExitCall)
5812 return Builder.saveIP();
5813
5814 // Place the exit call as the last instruction before the finalization block terminator.
5815 ExitCall->removeFromParent();
5816 Builder.Insert(ExitCall);
5817
5818 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5819 ExitCall->getIterator());
5820}
5821
5822OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5823 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5824 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5825 if (!IP.isSet())
5826 return IP;
5827
5828 IRBuilder<>::InsertPointGuard IPG(Builder);
5829
5830 // creates the following CFG structure
5831 // OMP_Entry : (MasterAddr != PrivateAddr)?
5832 // F T
5833 // | \
5834 // | copyin.not.master
5835 // | /
5836 // v /
5837 // copyin.not.master.end
5838 // |
5839 // v
5840 // OMP.Entry.Next
5841
5842 BasicBlock *OMP_Entry = IP.getBlock();
5843 Function *CurFn = OMP_Entry->getParent();
5844 BasicBlock *CopyBegin =
5845 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5846 BasicBlock *CopyEnd = nullptr;
5847
5848 // If entry block is terminated, split to preserve the branch to following
5849 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5850 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5851 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5852 "copyin.not.master.end");
5853 OMP_Entry->getTerminator()->eraseFromParent();
5854 } else {
5855 CopyEnd =
5856 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5857 }
5858
5859 Builder.SetInsertPoint(OMP_Entry);
5860 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5861 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5862 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5863 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5864
5865 Builder.SetInsertPoint(CopyBegin);
5866 if (BranchtoEnd)
5867 Builder.CreateBr(CopyEnd);
5868
5869 return Builder.saveIP();
5870}
5871
5872CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
5873 Value *Size, Value *Allocator,
5874 std::string Name) {
5875 IRBuilder<>::InsertPointGuard IPG(Builder);
5876 updateToLocation(Loc);
5877
5878 uint32_t SrcLocStrSize;
5879 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5880 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5881 Value *ThreadId = getOrCreateThreadID(Ident);
5882 Value *Args[] = {ThreadId, Size, Allocator};
5883
5884 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
5885
5886 return Builder.CreateCall(Fn, Args, Name);
5887}
5888
5889CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
5890 Value *Addr, Value *Allocator,
5891 std::string Name) {
5892 IRBuilder<>::InsertPointGuard IPG(Builder);
5893 updateToLocation(Loc);
5894
5895 uint32_t SrcLocStrSize;
5896 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5897 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5898 Value *ThreadId = getOrCreateThreadID(Ident);
5899 Value *Args[] = {ThreadId, Addr, Allocator};
5900 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
5901 return Builder.CreateCall(Fn, Args, Name);
5902}
5903
5904CallInst *OpenMPIRBuilder::createOMPInteropInit(
5905 const LocationDescription &Loc, Value *InteropVar,
5906 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
5907 Value *DependenceAddress, bool HaveNowaitClause) {
5908 IRBuilder<>::InsertPointGuard IPG(Builder);
5909 updateToLocation(Loc);
5910
5911 uint32_t SrcLocStrSize;
5912 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5913 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5914 Value *ThreadId = getOrCreateThreadID(Ident);
5915 if (Device == nullptr)
5916 Device = ConstantInt::get(Int32, -1);
5917 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
5918 if (NumDependences == nullptr) {
5919 NumDependences = ConstantInt::get(Int32, 0);
5920 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5921 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5922 }
5923 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5924 Value *Args[] = {
5925 Ident, ThreadId, InteropVar, InteropTypeVal,
5926 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
5927
5928 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
5929
5930 return Builder.CreateCall(Fn, Args);
5931}
5932
5933CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
5934 const LocationDescription &Loc, Value *InteropVar, Value *Device,
5935 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
5936 IRBuilder<>::InsertPointGuard IPG(Builder);
5937 updateToLocation(Loc);
5938
5939 uint32_t SrcLocStrSize;
5940 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5941 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5942 Value *ThreadId = getOrCreateThreadID(Ident);
5943 if (Device == nullptr)
5944 Device = ConstantInt::get(Int32, -1);
5945 if (NumDependences == nullptr) {
5946 NumDependences = ConstantInt::get(Int32, 0);
5947 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5948 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5949 }
5950 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5951 Value *Args[] = {
5952 Ident, ThreadId, InteropVar, Device,
5953 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5954
5955 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
5956
5957 return Builder.CreateCall(Fn, Args);
5958}
5959
5960CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
5961 Value *InteropVar, Value *Device,
5962 Value *NumDependences,
5963 Value *DependenceAddress,
5964 bool HaveNowaitClause) {
5965 IRBuilder<>::InsertPointGuard IPG(Builder);
5966 updateToLocation(Loc);
5967 uint32_t SrcLocStrSize;
5968 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5969 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5970 Value *ThreadId = getOrCreateThreadID(Ident);
5971 if (Device == nullptr)
5972 Device = ConstantInt::get(Int32, -1);
5973 if (NumDependences == nullptr) {
5974 NumDependences = ConstantInt::get(Int32, 0);
5975 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5976 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5977 }
5978 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5979 Value *Args[] = {
5980 Ident, ThreadId, InteropVar, Device,
5981 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5982
5983 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
5984
5985 return Builder.CreateCall(Fn, Args);
5986}
5987
5988CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
5989 const LocationDescription &Loc, llvm::Value *Pointer,
5990 llvm::ConstantInt *Size, const llvm::Twine &Name) {
5991 IRBuilder<>::InsertPointGuard IPG(Builder);
5992 updateToLocation(Loc);
5993
5994 uint32_t SrcLocStrSize;
5995 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5996 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5997 Value *ThreadId = getOrCreateThreadID(Ident);
5998 Constant *ThreadPrivateCache =
5999 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6000 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6001
6002 Function *Fn =
6003 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6004
6005 return Builder.CreateCall(Fn, Args);
6006}
6007
6008OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6009 const LocationDescription &Loc, bool IsSPMD,
6010 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6011 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6012 if (!updateToLocation(Loc))
6013 return Loc.IP;
6014
6015 uint32_t SrcLocStrSize;
6016 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6017 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6018 Constant *IsSPMDVal = ConstantInt::getSigned(
6019 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
6020 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6021 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6022 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6023
6024 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6025 Function *Kernel = DebugKernelWrapper;
6026
6027 // We need to strip the debug prefix to get the correct kernel name.
6028 StringRef KernelName = Kernel->getName();
6029 const std::string DebugPrefix = "_debug__";
6030 if (KernelName.ends_with(DebugPrefix)) {
6031 KernelName = KernelName.drop_back(DebugPrefix.length());
6032 Kernel = M.getFunction(KernelName);
6033 assert(Kernel && "Expected the real kernel to exist");
6034 }
6035
6036 // Manifest the launch configuration in the metadata matching the kernel
6037 // environment.
6038 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6039 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6040
6041 // For max values, < 0 means unset, == 0 means set but unknown.
6042 if (MaxThreadsVal < 0)
6043 MaxThreadsVal = std::max(
6044 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6045
6046 if (MaxThreadsVal > 0)
6047 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6048
6049 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6050 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6051 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6052 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6053 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6054 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6055
6056 Function *Fn = getOrCreateRuntimeFunctionPtr(
6057 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6058 const DataLayout &DL = Fn->getDataLayout();
6059
6060 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6061 Constant *DynamicEnvironmentInitializer =
6062 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6063 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6064 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6065 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6066 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6067 DL.getDefaultGlobalsAddressSpace());
6068 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6069
6070 Constant *DynamicEnvironment =
6071 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6072 ? DynamicEnvironmentGV
6073 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6074 DynamicEnvironmentPtr);
6075
6076 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6077 ConfigurationEnvironment, {
6078 UseGenericStateMachineVal,
6079 MayUseNestedParallelismVal,
6080 IsSPMDVal,
6081 MinThreads,
6082 MaxThreads,
6083 MinTeams,
6084 MaxTeams,
6085 ReductionDataSize,
6086 ReductionBufferLength,
6087 });
6088 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6089 KernelEnvironment, {
6090 ConfigurationEnvironmentInitializer,
6091 Ident,
6092 DynamicEnvironment,
6093 });
6094 std::string KernelEnvironmentName =
6095 (KernelName + "_kernel_environment").str();
6096 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6097 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6098 KernelEnvironmentInitializer, KernelEnvironmentName,
6099 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6100 DL.getDefaultGlobalsAddressSpace());
6101 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6102
6103 Constant *KernelEnvironment =
6104 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6105 ? KernelEnvironmentGV
6106 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6107 KernelEnvironmentPtr);
6108 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6109 CallInst *ThreadKind =
6110 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6111
6112 Value *ExecUserCode = Builder.CreateICmpEQ(
6113 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6114 "exec_user_code");
6115
6116 // ThreadKind = __kmpc_target_init(...)
6117 // if (ThreadKind == -1)
6118 // user_code
6119 // else
6120 // return;
6121
6122 auto *UI = Builder.CreateUnreachable();
6123 BasicBlock *CheckBB = UI->getParent();
6124 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6125
6126 BasicBlock *WorkerExitBB = BasicBlock::Create(
6127 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6128 Builder.SetInsertPoint(WorkerExitBB);
6129 Builder.CreateRetVoid();
6130
6131 auto *CheckBBTI = CheckBB->getTerminator();
6132 Builder.SetInsertPoint(CheckBBTI);
6133 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6134
6135 CheckBBTI->eraseFromParent();
6136 UI->eraseFromParent();
6137
6138 // Continue in the "user_code" block, see diagram above and in
6139 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6140 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6141}
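// For illustration only (not emitted verbatim; names abbreviated): for a
// Generic-mode kernel "foo", the sequence built above results in IR roughly
// like
//   %tk = call i32 @__kmpc_target_init(ptr @foo_kernel_environment,
//                                      ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
// so only threads for which __kmpc_target_init returns -1 run the user code.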
6142
6143 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6144 int32_t TeamsReductionDataSize,
6145 int32_t TeamsReductionBufferLength) {
6146 if (!updateToLocation(Loc))
6147 return;
6148
6149 Function *Fn = getOrCreateRuntimeFunctionPtr(
6150 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6151
6152 Builder.CreateCall(Fn, {});
6153
6154 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6155 return;
6156
6157 Function *Kernel = Builder.GetInsertBlock()->getParent();
6158 // We need to strip the debug prefix to get the correct kernel name.
6159 StringRef KernelName = Kernel->getName();
6160 const std::string DebugPrefix = "_debug__";
6161 if (KernelName.ends_with(DebugPrefix))
6162 KernelName = KernelName.drop_back(DebugPrefix.length());
6163 auto *KernelEnvironmentGV =
6164 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6165 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6166 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6167 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6168 KernelEnvironmentInitializer,
6169 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6170 NewInitializer = ConstantFoldInsertValueInstruction(
6171 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6172 {0, 8});
6173 KernelEnvironmentGV->setInitializer(NewInitializer);
6174}
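// Note on the insertvalue indices above: {0, 7} and {0, 8} address the
// ReductionDataSize and ReductionBufferLength fields through member 0 of the
// kernel environment (the nested ConfigurationEnvironment), matching the
// initializer layout built in createTargetInit.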
6175
6176 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6177 Module &M = *Kernel.getParent();
6178 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6179 for (auto *Op : MD->operands()) {
6180 if (Op->getNumOperands() != 3)
6181 continue;
6182 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6183 if (!KernelOp || KernelOp->getValue() != &Kernel)
6184 continue;
6185 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6186 if (!Prop || Prop->getString() != Name)
6187 continue;
6188 return Op;
6189 }
6190 return nullptr;
6191}
6192
6193 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6194 bool Min) {
6195 // Update the "maxntidx" metadata for NVIDIA, or add it.
6196 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6197 if (ExistingOp) {
6198 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6199 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6200 ExistingOp->replaceOperandWith(
6201 2, ConstantAsMetadata::get(ConstantInt::get(
6202 OldVal->getValue()->getType(),
6203 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6204 } else {
6205 LLVMContext &Ctx = Kernel.getContext();
6206 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6207 MDString::get(Ctx, Name),
6208 ConstantAsMetadata::get(
6209 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6210 // Append metadata to nvvm.annotations
6211 Module &M = *Kernel.getParent();
6212 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6213 MD->addOperand(MDNode::get(Ctx, MDVals));
6214 }
6215}
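// For illustration, a resulting entry in the named metadata might look like
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}
// where an existing entry is tightened via the std::min/std::max above
// rather than duplicated.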
6216
6217std::pair<int32_t, int32_t>
6218 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6219 int32_t ThreadLimit =
6220 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6221
6222 if (T.isAMDGPU()) {
6223 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6224 if (!Attr.isValid() || !Attr.isStringAttribute())
6225 return {0, ThreadLimit};
6226 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6227 int32_t LB, UB;
6228 if (!llvm::to_integer(UBStr, UB, 10))
6229 return {0, ThreadLimit};
6230 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6231 if (!llvm::to_integer(LBStr, LB, 10))
6232 return {0, UB};
6233 return {LB, UB};
6234 }
6235
6236 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6237 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6238 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6239 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6240 }
6241 return {0, ThreadLimit};
6242}
6243
6244 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6245 Function &Kernel, int32_t LB,
6246 int32_t UB) {
6247 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6248
6249 if (T.isAMDGPU()) {
6250 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6251 llvm::utostr(LB) + "," + llvm::utostr(UB));
6252 return;
6253 }
6254
6255 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6256}
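// E.g. writeThreadBoundsForKernel(T, K, 1, 256) records
// "omp_target_thread_limit"="256" and, on AMDGPU, the attribute
// "amdgpu-flat-work-group-size"="1,256"; on NVPTX it updates the "maxntidx"
// annotation instead.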
6257
6258std::pair<int32_t, int32_t>
6259 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6260 // TODO: Read from backend annotations if available.
6261 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6262}
6263
6264 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6265 int32_t LB, int32_t UB) {
6266 if (T.isNVPTX())
6267 if (UB > 0)
6268 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6269 if (T.isAMDGPU())
6270 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6271
6272 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6273}
6274
6275void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6276 Function *OutlinedFn) {
6277 if (Config.isTargetDevice()) {
6278 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6279 // TODO: Determine if DSO local can be set to true.
6280 OutlinedFn->setDSOLocal(false);
6281 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6282 if (T.isAMDGCN())
6283 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6284 }
6285}
6286
6287Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6288 StringRef EntryFnIDName) {
6289 if (Config.isTargetDevice()) {
6290 assert(OutlinedFn && "The outlined function must exist if embedded");
6291 return OutlinedFn;
6292 }
6293
6294 return new GlobalVariable(
6295 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6296 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6297}
6298
6299Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6300 StringRef EntryFnName) {
6301 if (OutlinedFn)
6302 return OutlinedFn;
6303
6304 assert(!M.getGlobalVariable(EntryFnName, true) &&
6305 "Named kernel already exists?");
6306 return new GlobalVariable(
6307 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6308 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6309}
6310
6311 void OpenMPIRBuilder::emitTargetRegionFunction(
6312 TargetRegionEntryInfo &EntryInfo,
6313 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6314 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6315
6316 SmallString<64> EntryFnName;
6317 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6318
6319 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
6320 ? GenerateFunctionCallback(EntryFnName)
6321 : nullptr;
6322
6323 // If this target outline function is not an offload entry, we don't need to
6324 // register it. This may be the case with a false if clause, or if there are
6325 // no OpenMP targets.
6326 if (!IsOffloadEntry)
6327 return;
6328
6329 std::string EntryFnIDName =
6330 Config.isTargetDevice()
6331 ? std::string(EntryFnName)
6332 : createPlatformSpecificName({EntryFnName, "region_id"});
6333
6334 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6335 EntryFnName, EntryFnIDName);
6336}
6337
6338 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6339 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6340 StringRef EntryFnName, StringRef EntryFnIDName) {
6341 if (OutlinedFn)
6342 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6343 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6344 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6345 OffloadInfoManager.registerTargetRegionEntryInfo(
6346 EntryInfo, EntryAddr, OutlinedFnID,
6347 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6348 return OutlinedFnID;
6349}
6350
6351 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
6352 const LocationDescription &Loc, InsertPointTy AllocaIP,
6353 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6354 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6355 omp::RuntimeFunction *MapperFunc,
6356 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
6357 BodyGenCB,
6358 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6359 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6360 if (!updateToLocation(Loc))
6361 return InsertPointTy();
6362
6363 // Disable TargetData CodeGen on Device pass.
6364 if (Config.IsTargetDevice.value_or(false)) {
6365 if (BodyGenCB)
6366 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6367 return Builder.saveIP();
6368 }
6369
6370 Builder.restoreIP(CodeGenIP);
6371 bool IsStandAlone = !BodyGenCB;
6372 MapInfosTy *MapInfo;
6373 // Generate the code for the opening of the data environment. Capture all the
6374 // arguments of the runtime call by reference because they are used in the
6375 // closing of the region.
6376 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6377 MapInfo = &GenMapInfoCB(Builder.saveIP());
6378 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6379 /*IsNonContiguous=*/true, DeviceAddrCB,
6380 CustomMapperCB);
6381
6382 TargetDataRTArgs RTArgs;
6383 emitOffloadingArraysArgument(Builder, RTArgs, Info);
6384
6385 // Emit the number of elements in the offloading arrays.
6386 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6387
6388 // Source location for the ident struct
6389 if (!SrcLocInfo) {
6390 uint32_t SrcLocStrSize;
6391 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6392 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6393 }
6394
6395 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6396 PointerNum, RTArgs.BasePointersArray,
6397 RTArgs.PointersArray, RTArgs.SizesArray,
6398 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6399 RTArgs.MappersArray};
6400
6401 if (IsStandAlone) {
6402 assert(MapperFunc && "MapperFunc missing for standalone target data");
6403 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6404 OffloadingArgs);
6405 } else {
6406 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6407 omp::OMPRTL___tgt_target_data_begin_mapper);
6408
6409 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6410
6411 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6412 if (isa<AllocaInst>(DeviceMap.second.second)) {
6413 auto *LI =
6414 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6415 Builder.CreateStore(LI, DeviceMap.second.second);
6416 }
6417 }
6418
6419 // If device pointer privatization is required, emit the body of the
6420 // region here. It will have to be duplicated: with and without
6421 // privatization.
6422 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
6423 }
6424 };
6425
6426 // If we need device pointer privatization, we need to emit the body of the
6427 // region with no privatization in the 'else' branch of the conditional.
6428 // Otherwise, we don't have to do anything.
6429 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6430 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
6431 };
6432
6433 // Generate code for the closing of the data region.
6434 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6435 TargetDataRTArgs RTArgs;
6436 Info.EmitDebug = !MapInfo->Names.empty();
6437 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6438
6439 // Emit the number of elements in the offloading arrays.
6440 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6441
6442 // Source location for the ident struct
6443 if (!SrcLocInfo) {
6444 uint32_t SrcLocStrSize;
6445 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6446 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6447 }
6448
6449 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6450 PointerNum, RTArgs.BasePointersArray,
6451 RTArgs.PointersArray, RTArgs.SizesArray,
6452 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6453 RTArgs.MappersArray};
6454 Function *EndMapperFunc =
6455 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6456
6457 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6458 };
6459
6460 // We don't have to do anything to close the region if the if clause evaluates
6461 // to false.
6462 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
6463
6464 if (BodyGenCB) {
6465 if (IfCond) {
6466 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6467 } else {
6468 BeginThenGen(AllocaIP, Builder.saveIP());
6469 }
6470
6471 // If we don't require privatization of device pointers, we emit the body in
6472 // between the runtime calls. This avoids duplicating the body code.
6473 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6474
6475 if (IfCond) {
6476 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6477 } else {
6478 EndThenGen(AllocaIP, Builder.saveIP());
6479 }
6480 } else {
6481 if (IfCond) {
6482 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6483 } else {
6484 BeginThenGen(AllocaIP, Builder.saveIP());
6485 }
6486 }
6487
6488 return Builder.saveIP();
6489}
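// A sketch of the shapes this can take: with a body generator (omp target
// data), the begin/end mapper calls bracket the body; standalone constructs
// (e.g. omp target enter data) emit a single mapper call; and an if clause
// guards either form through the then/else generators above.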
6490
6491 FunctionCallee OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6492 bool IVSigned,
6493 bool IsGPUDistribute) {
6494 assert((IVSize == 32 || IVSize == 64) &&
6495 "IV size is not compatible with the omp runtime");
6496 RuntimeFunction Name;
6497 if (IsGPUDistribute)
6498 Name = IVSize == 32
6499 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6500 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6501 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6502 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6503 else
6504 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6505 : omp::OMPRTL___kmpc_for_static_init_4u)
6506 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6507 : omp::OMPRTL___kmpc_for_static_init_8u);
6508
6509 return getOrCreateRuntimeFunction(M, Name);
6510}
6511
6512 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6513 bool IVSigned) {
6514 assert((IVSize == 32 || IVSize == 64) &&
6515 "IV size is not compatible with the omp runtime");
6516 RuntimeFunction Name = IVSize == 32
6517 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6518 : omp::OMPRTL___kmpc_dispatch_init_4u)
6519 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6520 : omp::OMPRTL___kmpc_dispatch_init_8u);
6521
6522 return getOrCreateRuntimeFunction(M, Name);
6523}
6524
6525 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6526 bool IVSigned) {
6527 assert((IVSize == 32 || IVSize == 64) &&
6528 "IV size is not compatible with the omp runtime");
6529 RuntimeFunction Name = IVSize == 32
6530 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6531 : omp::OMPRTL___kmpc_dispatch_next_4u)
6532 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6533 : omp::OMPRTL___kmpc_dispatch_next_8u);
6534
6535 return getOrCreateRuntimeFunction(M, Name);
6536}
6537
6538 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6539 bool IVSigned) {
6540 assert((IVSize == 32 || IVSize == 64) &&
6541 "IV size is not compatible with the omp runtime");
6542 RuntimeFunction Name = IVSize == 32
6543 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6544 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6545 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6546 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6547
6548 return getOrCreateRuntimeFunction(M, Name);
6549}
6550
6551 FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
6552 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6553}
6554
6555 static Function *createOutlinedFunction(
6556 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6557 SmallVectorImpl<Value *> &Inputs,
6558 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6559 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6560 SmallVector<Type *> ParameterTypes;
6561 if (OMPBuilder.Config.isTargetDevice()) {
6562 // Add the "implicit" runtime argument we use to provide launch specific
6563 // information for target devices.
6564 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6565 ParameterTypes.push_back(Int8PtrTy);
6566
6567 // All parameters to target devices are passed as pointers
6568 // or i64. This assumes 64-bit address spaces/pointers.
6569 for (auto &Arg : Inputs)
6570 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6571 ? Arg->getType()
6572 : Type::getInt64Ty(Builder.getContext()));
6573 } else {
6574 for (auto &Arg : Inputs)
6575 ParameterTypes.push_back(Arg->getType());
6576 }
6577
6578 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6579 /*isVarArg*/ false);
6580 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
6581 Builder.GetInsertBlock()->getModule());
6582
6583 // Save insert point.
6584 auto OldInsertPoint = Builder.saveIP();
6585
6586 // Generate the region into the function.
6587 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6588 Builder.SetInsertPoint(EntryBB);
6589
6590 // Insert target init call in the device compilation pass.
6591 if (OMPBuilder.Config.isTargetDevice())
6592 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6593
6594 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6595
6596 // As we embed the user code in the middle of our target region after we
6597 // generate entry code, we must move what allocas we can into the entry
6598 // block to avoid possibly breaking optimisations for the device.
6599 if (OMPBuilder.Config.isTargetDevice())
6600 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6601
6602 // Insert target deinit call in the device compilation pass.
6603 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
6604 if (OMPBuilder.Config.isTargetDevice())
6605 OMPBuilder.createTargetDeinit(Builder);
6606
6607 // Insert return instruction.
6608 Builder.CreateRetVoid();
6609
6610 // New Alloca IP at entry point of created device function.
6611 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6612 auto AllocaIP = Builder.saveIP();
6613
6614 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6615
6616 // Skip the artificial dyn_ptr on the device.
6617 const auto &ArgRange =
6618 OMPBuilder.Config.isTargetDevice()
6619 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6620 : Func->args();
6621
6622 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6623 // Things like GEPs can come in the form of Constants. Constants and
6624 // ConstantExprs do not know what they are contained in, so we must dig
6625 // a little to find an instruction that tells us whether they are used
6626 // inside of the function we are outlining. We also replace the original
6627 // constant expression with an equivalent instruction, because an
6628 // instruction allows easy modification in the following loop: we then
6629 // know the constant (now an instruction) is owned by our target function
6630 // and replaceUsesOfWith can be invoked on it (this cannot be done with
6631 // constants). A brand new instruction also lets us be cautious, as it is
6632 // perhaps possible the old expression was used inside of the function
6633 // but also exists and is used externally (unlikely by the nature of a
6634 // Constant, but still possible).
6635 // NOTE: We cannot remove dead constants that have been rewritten to
6636 // instructions at this stage; doing so runs the risk of breaking later
6637 // lowering, as we could still be in the process of lowering the module
6638 // from MLIR to LLVM-IR and the MLIR lowering may still require the
6639 // original constants we have created rewritten versions of.
6640 if (auto *Const = dyn_cast<Constant>(Input))
6641 convertUsersOfConstantsToInstructions(Const, Func, false);
6642
6643 // Replace all uses of Input within the outlined function with InputCopy.
6644 for (User *User : make_early_inc_range(Input->users()))
6645 if (auto *Instr = dyn_cast<Instruction>(User))
6646 if (Instr->getFunction() == Func)
6647 Instr->replaceUsesOfWith(Input, InputCopy);
6648 };
6649
6650 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6651
6652 // Rewrite uses of input values to parameters.
6653 for (auto InArg : zip(Inputs, ArgRange)) {
6654 Value *Input = std::get<0>(InArg);
6655 Argument &Arg = std::get<1>(InArg);
6656 Value *InputCopy = nullptr;
6657
6658 Builder.restoreIP(
6659 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
6660
6661 // In certain cases a Global may be set up for replacement; however, this
6662 // Global may be used by multiple arguments to the kernel, just segmented
6663 // apart. For example, if we have a global array that is sectioned into
6664 // multiple mappings (technically not legal in OpenMP, but there is a case
6665 // in Fortran for Common Blocks where this is necessary), we will end up
6666 // with GEPs into this array inside the kernel that refer to the Global
6667 // but are, for all intents and purposes, separate arguments to the
6668 // kernel. If we have mapped a segment that requires a GEP into the 0-th
6669 // index, the GEP will fold into a reference to the Global itself; if we
6670 // then encounter this folded GEP during replacement, all references to
6671 // the Global in the kernel will be replaced with the argument we have
6672 // generated for it, including any other GEPs that refer to the Global
6673 // and that are other arguments. This would invalidate all of the other
6674 // preceding mapped arguments that refer to the same global but are
6675 // separate segments. To prevent this, we defer global processing until
6676 // all other processing has been performed.
6677 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6678 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6679 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6680 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6681 continue;
6682 }
6683
6684 ReplaceValue(Input, InputCopy, Func);
6685 }
6686
6687 // Replace all of our deferred Input values, currently just Globals.
6688 for (auto Deferred : DeferredReplacement)
6689 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6690
6691 // Restore insert point.
6692 Builder.restoreIP(OldInsertPoint);
6693
6694 return Func;
6695}
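// For illustration only (the entry-name scheme is assumed, not produced
// here): on the device, an outlined region for "omp target map(tofrom : x)"
// ends up with a signature roughly like
//   void @__omp_offloading_<id>_foo(ptr %dyn_ptr, ptr %x)
// where %dyn_ptr is the implicit launch argument added above, and scalar
// inputs would instead be passed as i64 per the ParameterTypes logic.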
6696
6697 /// Create an entry point for a target task, i.e. a function with the
6698 /// following signature:
6699/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6700/// This function is called from emitTargetTask once the
6701/// code to launch the target kernel has been outlined already.
6702 static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6703 IRBuilderBase &Builder,
6704 CallInst *StaleCI) {
6705 Module &M = OMPBuilder.M;
6706 // KernelLaunchFunction is the target launch function, i.e.
6707 // the function that sets up kernel arguments and calls
6708 // __tgt_target_kernel to launch the kernel on the device.
6709 //
6710 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6711
6712 // StaleCI is the CallInst which is the call to the outlined
6713 // target kernel launch function. If there are values that the
6714 // outlined function uses then these are aggregated into a structure
6715 // which is passed as the second argument. If not, then there's
6716 // only one argument, the threadID. So, StaleCI can be
6717 //
6718 // %structArg = alloca { ptr, ptr }, align 8
6719 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6720 // store ptr %20, ptr %gep_, align 8
6721 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6722 // store ptr %21, ptr %gep_8, align 8
6723 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6724 //
6725 // OR
6726 //
6727 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6728 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
6729 StaleCI->getIterator());
6730 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6731 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6732 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6733 Type *TaskTy = OMPBuilder.Task;
6734 auto ProxyFnTy =
6735 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6736 /* isVarArg */ false);
6737 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6738 ".omp_target_task_proxy_func",
6739 Builder.GetInsertBlock()->getModule());
6740 ProxyFn->getArg(0)->setName("thread.id");
6741 ProxyFn->getArg(1)->setName("task");
6742
6743 BasicBlock *EntryBB =
6744 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6745 Builder.SetInsertPoint(EntryBB);
6746
6747 bool HasShareds = StaleCI->arg_size() > 1;
6748 // TODO: This is a temporary assert to prove to ourselves that
6749 // the outlined target launch function is always going to have
6750 // at most two arguments if there is any data shared between
6751 // host and device.
6752 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6753 "StaleCI with shareds should have exactly two arguments.");
6754 if (HasShareds) {
6755 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6756 assert(ArgStructAlloca &&
6757 "Unable to find the alloca instruction corresponding to arguments "
6758 "for extracted function");
6759 auto *ArgStructType =
6760 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6761
6762 AllocaInst *NewArgStructAlloca =
6763 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6764 Value *TaskT = ProxyFn->getArg(1);
6765 Value *ThreadId = ProxyFn->getArg(0);
6766 Value *SharedsSize =
6767 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6768
6769 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6770 LoadInst *LoadShared =
6771 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
6772
6773 Builder.CreateMemCpy(
6774 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
6775 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
6776
6777 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
6778 }
6779 Builder.CreateRetVoid();
6780 return ProxyFn;
6781}
6782 static void emitTargetOutlinedFunction(
6783 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
6784 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
6785 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
6786 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6787 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6788
6789 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
6790 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
6791 &ArgAccessorFuncCB](StringRef EntryFnName) {
6792 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
6793 CBFunc, ArgAccessorFuncCB);
6794 };
6795
6796 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
6797 IsOffloadEntry, OutlinedFn, OutlinedFnID);
6798}
6799 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
6800 Function *OutlinedFn, Value *OutlinedFnID,
6801 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
6802 Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
6803 SmallVector<OpenMPIRBuilder::DependData> &Dependencies,
6804 bool HasNoWait) {
6805
6806 // When we arrive at this function, the target region itself has been
6807 // outlined into the function OutlinedFn.
6808 // So at this point, for
6809 // --------------------------------------------------
6810 // void user_code_that_offloads(...) {
6811 // omp target depend(..) map(from:a) map(to:b, c)
6812 // a = b + c
6813 // }
6814 //
6815 // --------------------------------------------------
6816 //
6817 // we have
6818 //
6819 // --------------------------------------------------
6820 //
6821 // void user_code_that_offloads(...) {
6822 // %.offload_baseptrs = alloca [3 x ptr], align 8
6823 // %.offload_ptrs = alloca [3 x ptr], align 8
6824 // %.offload_mappers = alloca [3 x ptr], align 8
6825 // ;; target region has been outlined and now we need to
6826 // ;; offload to it via a target task.
6827 // }
6828 // void outlined_device_function(ptr a, ptr b, ptr c) {
6829 // *a = *b + *c
6830 // }
6831 //
6832 // We have to now do the following
6833 // (i) Make an offloading call to outlined_device_function using the OpenMP
6834 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
6835 // emitted by emitKernelLaunch
6836 // (ii) Create a task entry point function that calls kernel_launch_function
6837 // and is the entry point for the target task. See
6838 // '@.omp_target_task_proxy_func in the pseudocode below.
6839 // (iii) Create a task with the task entry point created in (ii)
6840 //
6841 // That is we create the following
6842 //
6843 // void user_code_that_offloads(...) {
6844 // %.offload_baseptrs = alloca [3 x ptr], align 8
6845 // %.offload_ptrs = alloca [3 x ptr], align 8
6846 // %.offload_mappers = alloca [3 x ptr], align 8
6847 //
6848 // %structArg = alloca { ptr, ptr, ptr }, align 8
6849 // %structArg[0] = %.offload_baseptrs
6850 // %structArg[1] = %.offload_ptrs
6851 // %structArg[2] = %.offload_mappers
6852 // proxy_target_task = @__kmpc_omp_task_alloc(...,
6853 // @.omp_target_task_proxy_func)
6854 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
6855 // dependencies_array = ...
6856 // ;; if nowait not present
6857 // call @__kmpc_omp_wait_deps(..., dependencies_array)
6858 // call @__kmpc_omp_task_begin_if0(...)
6859 // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
6860 // call @__kmpc_omp_task_complete_if0(...)
6861 // }
6862 //
6863 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
6864 // ptr %task) {
6865 // %structArg = alloca {ptr, ptr, ptr}
6866 // %shared_data = load (getelementptr %task, 0, 0)
6867 // memcpy(%structArg, %shared_data, sizeof(structArg))
6868 // kernel_launch_function(%thread.id, %structArg)
6869 // }
6870 //
6871 // We need the proxy function because the signature of the task entry point
6872 // expected by kmpc_omp_task is always the same and will be different from
6873 // that of the kernel_launch function.
6874 //
6875 // kernel_launch_function is generated by emitKernelLaunch and has the
6876 // always_inline attribute.
6877 // void kernel_launch_function(thread_id,
6878 // structArg) alwaysinline {
6879 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
6880 // offload_baseptrs = load(getelementptr structArg, 0, 0)
6881 // offload_ptrs = load(getelementptr structArg, 0, 1)
6882 // offload_mappers = load(getelementptr structArg, 0, 2)
6883 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
6884 // ; offload_mappers
6885 // call i32 @__tgt_target_kernel(...,
6886 // outlined_device_function,
6887 // ptr %kernel_args)
6888 // }
6889 // void outlined_device_function(ptr a, ptr b, ptr c) {
6890 // *a = *b + *c
6891 // }
6892 //
6893 BasicBlock *TargetTaskBodyBB =
6894 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
6895 BasicBlock *TargetTaskAllocaBB =
6896 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
6897
6898 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
6899 TargetTaskAllocaBB->begin());
6900 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
6901
6902 OutlineInfo OI;
6903 OI.EntryBB = TargetTaskAllocaBB;
6904 OI.OuterAllocaBB = AllocaIP.getBlock();
6905
6906 // Add the thread ID argument.
6907 SmallVector<Instruction *, 4> ToBeDeleted;
6908 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6909 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
6910
6911 Builder.restoreIP(TargetTaskBodyIP);
6912
6913 if (OutlinedFnID) {
6914 // emitKernelLaunch makes the necessary runtime call to offload the kernel.
6915 // We then outline all that code into a separate function
6916 // ('kernel_launch_function' in the pseudo code above). This function is
6917 // then called by the target task proxy function (see
6918 // '@.omp_target_task_proxy_func' in the pseudo code above)
6919 // "@.omp_target_task_proxy_func' is generated by
6920 // emitTargetTaskProxyFunction.
6921 Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
6922 EmitTargetCallFallbackCB, Args, DeviceID,
6923 RTLoc, TargetTaskAllocaIP));
6924 } else {
6925 // When OutlinedFnID is set to nullptr, then it's not an offloading call. In
6926 // this case, we execute the host implementation directly.
6927 Builder.restoreIP(EmitTargetCallFallbackCB(Builder.saveIP()));
6928 }
6929
6930 OI.ExitBB = Builder.saveIP().getBlock();
6931 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
6932 HasNoWait](Function &OutlinedFn) mutable {
6933 assert(OutlinedFn.getNumUses() == 1 &&
6934 "there must be a single user for the outlined function");
6935
6936 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6937 bool HasShareds = StaleCI->arg_size() > 1;
6938
6939 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
6940
6941 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
6942 << "\n");
6943
6944 Builder.SetInsertPoint(StaleCI);
6945
6946 // Gather the arguments for emitting the runtime call.
6947 uint32_t SrcLocStrSize;
6948 Constant *SrcLocStr =
6949 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
6950 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6951
6952 // @__kmpc_omp_task_alloc
6953 Function *TaskAllocFn =
6954 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
6955
6956 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
6957 // call.
6958 Value *ThreadID = getOrCreateThreadID(Ident);
6959
6960 // Argument - `sizeof_kmp_task_t` (TaskSize)
6961 // Tasksize refers to the size in bytes of kmp_task_t data structure
6962 // including private vars accessed in task.
6963 // TODO: add kmp_task_t_with_privates (privates)
6964 Value *TaskSize =
6965 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
6966
6967 // Argument - `sizeof_shareds` (SharedsSize)
6968 // SharedsSize refers to the shareds array size in the kmp_task_t data
6969 // structure.
6970 Value *SharedsSize = Builder.getInt64(0);
6971 if (HasShareds) {
6972 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6973 assert(ArgStructAlloca &&
6974 "Unable to find the alloca instruction corresponding to arguments "
6975 "for extracted function");
6976 auto *ArgStructType =
6977 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6978 assert(ArgStructType && "Unable to find struct type corresponding to "
6979 "arguments for extracted function");
6980 SharedsSize =
6981 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6982 }
6983
6984 // Argument - `flags`
6985 // Task is tied iff (Flags & 1) == 1.
6986 // Task is untied iff (Flags & 1) == 0.
6987 // Task is final iff (Flags & 2) == 2.
6988 // Task is not final iff (Flags & 2) == 0.
6989 // A target task is not final and is untied.
6990 Value *Flags = Builder.getInt32(0);
6991
6992 // Emit the @__kmpc_omp_task_alloc runtime call
6993 // The runtime call returns a pointer to an area where the task captured
6994 // variables must be copied before the task is run (TaskData)
6995 CallInst *TaskData = Builder.CreateCall(
6996 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
6997 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
6998 /*task_func=*/ProxyFn});
6999
7000 if (HasShareds) {
7001 Value *Shareds = StaleCI->getArgOperand(1);
7002 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7003 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7004 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7005 SharedsSize);
7006 }
7007
7008 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7009
7010 // ---------------------------------------------------------------
7011 // V5.2 13.8 target construct
7012 // If the nowait clause is present, execution of the target task
7013 // may be deferred. If the nowait clause is not present, the target task is
7014 // an included task.
7015 // ---------------------------------------------------------------
7016 // The above means that the lack of a nowait on the target construct
7017 // translates to '#pragma omp task if(0)'
7018 if (!HasNoWait) {
7019 if (DepArray) {
7020 Function *TaskWaitFn =
7021 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7022 Builder.CreateCall(
7023 TaskWaitFn,
7024 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7025 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7026 /*dep_list=*/DepArray,
7027 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7028 /*noalias_dep_list=*/
7029 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7030 }
7031 // Included task.
7032 Function *TaskBeginFn =
7033 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7034 Function *TaskCompleteFn =
7035 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7036 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7037 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7038 CI->setDebugLoc(StaleCI->getDebugLoc());
7039 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7040 } else if (DepArray) {
7041 // HasNoWait - meaning the task may be deferred. Call
7042 // __kmpc_omp_task_with_deps if there are dependencies,
7043 // else call __kmpc_omp_task
7044 Function *TaskFn =
7045 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7046 Builder.CreateCall(
7047 TaskFn,
7048 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7049 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7050 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7051 } else {
7052 // Emit the @__kmpc_omp_task runtime call to spawn the task
7053 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7054 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7055 }
7056
7057 StaleCI->eraseFromParent();
7058 for (Instruction *I : llvm::reverse(ToBeDeleted))
7059 I->eraseFromParent();
7060 };
7061 addOutlineInfo(std::move(OI));
7062
7063 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7064 << *(Builder.GetInsertBlock()) << "\n");
7065 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7066 << *(Builder.GetInsertBlock()->getParent()->getParent())
7067 << "\n");
7068 return Builder.saveIP();
7069}
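// In summary (a sketch of the dispatch logic above): without nowait the
// target task is an included task, lowered to __kmpc_omp_wait_deps (when
// dependencies exist) plus __kmpc_omp_task_begin_if0, the proxy call, and
// __kmpc_omp_task_complete_if0; with nowait it becomes a deferrable task via
// __kmpc_omp_task_with_deps or __kmpc_omp_task.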
7070
7071 void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7072 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7073 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7074 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7075 function_ref<Value *(unsigned int)> CustomMapperCB) {
7076 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7077 DeviceAddrCB, CustomMapperCB);
7078 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7079}
7080
7081static void emitTargetCall(
7082 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7083 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7084 Constant *OutlinedFnID, ArrayRef<int32_t> NumTeams,
7085 ArrayRef<int32_t> NumThreads, SmallVectorImpl<Value *> &Args,
7086 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7087 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
7088 // Generate a function call to the host fallback implementation of the target
7089 // region. This is called by the host when no offload entry was generated for
7090 // the target region and when the offloading call fails at runtime.
7091 auto &&EmitTargetCallFallbackCB =
7092 [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
7093 Builder.restoreIP(IP);
7094 Builder.CreateCall(OutlinedFn, Args);
7095 return Builder.saveIP();
7096 };
7097
7098 bool HasNoWait = false;
7099 bool HasDependencies = Dependencies.size() > 0;
7100 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7101
7102 // If we don't have an ID for the target region, it means an offload entry
7103 // wasn't created. In this case we just run the host fallback directly.
7104 if (!OutlinedFnID) {
7105 if (RequiresOuterTargetTask) {
7106 // Arguments that are intended to be directly forwarded to an
7107 // emitKernelLaunch call are passed as nullptr, since OutlinedFnID=nullptr
7108 // results in that call not being done.
7109 OpenMPIRBuilder::TargetKernelArgs KArgs;
7110 Builder.restoreIP(OMPBuilder.emitTargetTask(
7111 OutlinedFn, /*OutlinedFnID=*/nullptr, EmitTargetCallFallbackCB, KArgs,
7112 /*DeviceID=*/nullptr, /*RTLoc=*/nullptr, AllocaIP, Dependencies,
7113 HasNoWait));
7114 } else {
7115 Builder.restoreIP(EmitTargetCallFallbackCB(Builder.saveIP()));
7116 }
7117 return;
7118 }
7119
7120 OpenMPIRBuilder::TargetDataInfo Info(
7121 /*RequiresDevicePointerInfo=*/false,
7122 /*SeparateBeginEndCalls=*/true);
7123
7124 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7125 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7126 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7127 RTArgs, MapInfo,
7128 /*IsNonContiguous=*/true,
7129 /*ForEndCall=*/false);
7130
7131 SmallVector<Value *, 3> NumTeamsC;
7132 SmallVector<Value *, 3> NumThreadsC;
7133 for (auto V : NumTeams)
7134 NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7135 for (auto V : NumThreads)
7136 NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7137
7138 unsigned NumTargetItems = Info.NumberOfPtrs;
7139 // TODO: Use correct device ID
7140 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7141 uint32_t SrcLocStrSize;
7142 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7143 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7144 llvm::omp::IdentFlag(0), 0);
7145 // TODO: Use correct NumIterations
7146 Value *NumIterations = Builder.getInt64(0);
7147 // TODO: Use correct DynCGGroupMem
7148 Value *DynCGGroupMem = Builder.getInt32(0);
7149
7150 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
7151 NumTeamsC, NumThreadsC, DynCGGroupMem,
7152 HasNoWait);
7153
7154 // The presence of certain clauses on the target directive require the
7155 // explicit generation of the target task.
7156 if (RequiresOuterTargetTask) {
7157 Builder.restoreIP(OMPBuilder.emitTargetTask(
7158 OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
7159 RTLoc, AllocaIP, Dependencies, HasNoWait));
7160 } else {
7161 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
7162 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
7163 DeviceID, RTLoc, AllocaIP));
7164 }
7165}
7166
7167 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
7168 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7169 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7170 ArrayRef<int32_t> NumTeams, ArrayRef<int32_t> NumThreads,
7171 SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7172 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7173 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7174 SmallVector<DependData> Dependencies) {
7175
7176 if (!updateToLocation(Loc))
7177 return InsertPointTy();
7178
7179 Builder.restoreIP(CodeGenIP);
7180
7181 Function *OutlinedFn;
7182 Constant *OutlinedFnID = nullptr;
7183 // The target region is outlined into its own function. The LLVM IR for
7184 // the target region itself is generated using the callbacks CBFunc
7185 // and ArgAccessorFuncCB
7186 emitTargetOutlinedFunction(*this, Builder, IsOffloadEntry, EntryInfo,
7187 OutlinedFn, OutlinedFnID, Args, CBFunc,
7188 ArgAccessorFuncCB);
7189
7190 // If we are not on the target device, then we need to generate code
7191 // to make a remote call (offload) to the previously outlined function
7192 // that represents the target region. Do that now.
7193 if (!Config.isTargetDevice())
7194 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7195 NumThreads, Args, GenMapInfoCB, Dependencies);
7196 return Builder.saveIP();
7197}
7198
7199std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7200 StringRef FirstSeparator,
7201 StringRef Separator) {
7202 SmallString<128> Buffer;
7203 llvm::raw_svector_ostream OS(Buffer);
7204 StringRef Sep = FirstSeparator;
7205 for (StringRef Part : Parts) {
7206 OS << Sep << Part;
7207 Sep = Separator;
7208 }
7209 return OS.str().str();
7210}
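// E.g. getNameWithSeparators({"x", "y", "z"}, "_", "$") yields "_x$y$z": the
// first separator is emitted once, the regular separator thereafter.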
7211
7212std::string
7213 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7214 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7215 Config.separator());
7216}
7217
7218 GlobalVariable *
7219 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7220 unsigned AddressSpace) {
7221 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7222 if (Elem.second) {
7223 assert(Elem.second->getValueType() == Ty &&
7224 "OMP internal variable has different type than requested");
7225 } else {
7226 // TODO: investigate the appropriate linkage type used for the global
7227 // variable for possibly changing that to internal or private, or maybe
7228 // create different versions of the function for different OMP internal
7229 // variables.
7230 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7231 ? GlobalValue::InternalLinkage
7232 : GlobalValue::CommonLinkage;
7233 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7234 Constant::getNullValue(Ty), Elem.first(),
7235 /*InsertBefore=*/nullptr,
7236 GlobalValue::NotThreadLocal, AddressSpace);
7237 const DataLayout &DL = M.getDataLayout();
7238 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7239 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7240 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7241 Elem.second = GV;
7242 }
7243
7244 return Elem.second;
7245}
7246
7247Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7248 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7249 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7250 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7251}
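// E.g. for "#pragma omp critical (foo)" this returns an internal global
// named ".gomp_critical_user_foo.var" of type KmpCriticalNameTy, the lock
// word that the critical-section runtime calls operate on.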
7252
7253 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7254 LLVMContext &Ctx = Builder.getContext();
7255 Value *Null =
7256 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7257 Value *SizeGep =
7258 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7259 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7260 return SizePtrToInt;
7261}
7262
7263 GlobalVariable *
7264 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7265 std::string VarName) {
7266 llvm::Constant *MaptypesArrayInit =
7267 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7268 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7269 M, MaptypesArrayInit->getType(),
7270 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7271 VarName);
7272 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7273 return MaptypesArrayGlobal;
7274}
7275
7276 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7277 InsertPointTy AllocaIP,
7278 unsigned NumOperands,
7279 struct MapperAllocas &MapperAllocas) {
7280 if (!updateToLocation(Loc))
7281 return;
7282
7283 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7284 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7285 Builder.restoreIP(AllocaIP);
7286 AllocaInst *ArgsBase = Builder.CreateAlloca(
7287 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7288 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7289 ".offload_ptrs");
7290 AllocaInst *ArgSizes = Builder.CreateAlloca(
7291 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7292 Builder.restoreIP(Loc.IP);
7293 MapperAllocas.ArgsBase = ArgsBase;
7294 MapperAllocas.Args = Args;
7295 MapperAllocas.ArgSizes = ArgSizes;
7296}
7297
7298 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7299 Function *MapperFunc, Value *SrcLocInfo,
7300 Value *MaptypesArg, Value *MapnamesArg,
7301 struct MapperAllocas &MapperAllocas,
7302 int64_t DeviceID, unsigned NumOperands) {
7303 if (!updateToLocation(Loc))
7304 return;
7305
7306 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7307 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7308 Value *ArgsBaseGEP =
7309 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7310 {Builder.getInt32(0), Builder.getInt32(0)});
7311 Value *ArgsGEP =
7312 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7313 {Builder.getInt32(0), Builder.getInt32(0)});
7314 Value *ArgSizesGEP =
7315 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7316 {Builder.getInt32(0), Builder.getInt32(0)});
7317 Value *NullPtr =
7318 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7319 Builder.CreateCall(MapperFunc,
7320 {SrcLocInfo, Builder.getInt64(DeviceID),
7321 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7322 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7323}
7324
7325 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7326 TargetDataRTArgs &RTArgs,
7327 TargetDataInfo &Info,
7328 bool ForEndCall) {
7329 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7330 "expected region end call to runtime only when end call is separate");
7331 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7332 auto VoidPtrTy = UnqualPtrTy;
7333 auto VoidPtrPtrTy = UnqualPtrTy;
7334 auto Int64Ty = Type::getInt64Ty(M.getContext());
7335 auto Int64PtrTy = UnqualPtrTy;
7336
7337 if (!Info.NumberOfPtrs) {
7338 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7339 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7340 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7341 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7342 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7343 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7344 return;
7345 }
7346
7347 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7348 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7349 Info.RTArgs.BasePointersArray,
7350 /*Idx0=*/0, /*Idx1=*/0);
7351 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7352 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7353 /*Idx0=*/0,
7354 /*Idx1=*/0);
7355 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7356 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7357 /*Idx0=*/0, /*Idx1=*/0);
7358 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7359 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7360 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7361 : Info.RTArgs.MapTypesArray,
7362 /*Idx0=*/0,
7363 /*Idx1=*/0);
7364
7365 // Only emit the mapper information arrays if debug information is
7366 // requested.
7367 if (!Info.EmitDebug)
7368 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7369 else
7370 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7371 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7372 /*Idx0=*/0,
7373 /*Idx1=*/0);
7374 // If there is no user-defined mapper, set the mapper array to nullptr to
7375 // avoid an unnecessary data privatization
7376 if (!Info.HasMapper)
7377 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7378 else
7379 RTArgs.MappersArray =
7380 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7381}
7382
7383 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7384 InsertPointTy CodeGenIP,
7385 MapInfosTy &CombinedInfo,
7386 TargetDataInfo &Info) {
7387 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7388 CombinedInfo.NonContigInfo;
7389
7390 // Build an array of struct descriptor_dim and then assign it to
7391 // offload_args.
7392 //
7393 // struct descriptor_dim {
7394 // uint64_t offset;
7395 // uint64_t count;
7396 // uint64_t stride
7397 // };
7398 Type *Int64Ty = Builder.getInt64Ty();
7399 StructType *DimTy = StructType::create(
7400 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7401 "struct.descriptor_dim");
7402
7403 enum { OffsetFD = 0, CountFD, StrideFD };
7404 // We need two index variable here since the size of "Dims" is the same as
7405 // the size of Components, however, the size of offset, count, and stride is
7406 // equal to the size of base declaration that is non-contiguous.
7407 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7408 // Skip emitting ir if dimension size is 1 since it cannot be
7409 // non-contiguous.
7410 if (NonContigInfo.Dims[I] == 1)
7411 continue;
7412 Builder.restoreIP(AllocaIP);
7413 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7414 AllocaInst *DimsAddr =
7415 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7416 Builder.restoreIP(CodeGenIP);
7417 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7418 unsigned RevIdx = EE - II - 1;
7419 Value *DimsLVal = Builder.CreateInBoundsGEP(
7420 DimsAddr->getAllocatedType(), DimsAddr,
7421 {Builder.getInt64(0), Builder.getInt64(II)});
7422 // Offset
7423 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7424 Builder.CreateAlignedStore(
7425 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7426 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7427 // Count
7428 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7429 Builder.CreateAlignedStore(
7430 NonContigInfo.Counts[L][RevIdx], CountLVal,
7431 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7432 // Stride
7433 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7434 Builder.CreateAlignedStore(
7435 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7436 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7437 }
7438 // args[I] = &dims
7439 Builder.restoreIP(CodeGenIP);
7440 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7441 DimsAddr, Builder.getPtrTy());
7442 Value *P = Builder.CreateConstInBoundsGEP2_32(
7443 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7444 Info.RTArgs.PointersArray, 0, I);
7445 Builder.CreateAlignedStore(
7446 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7447 ++L;
7448 }
7449}
7450
7451 void OpenMPIRBuilder::emitOffloadingArrays(
7452 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
7453 TargetDataInfo &Info, bool IsNonContiguous,
7454 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7455 function_ref<Value *(unsigned int)> CustomMapperCB) {
7456
7457 // Reset the array information.
7458 Info.clearArrayInfo();
7459 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
7460
7461 if (Info.NumberOfPtrs == 0)
7462 return;
7463
7464 Builder.restoreIP(AllocaIP);
7465 // Detect if we have any capture size requiring runtime evaluation of the
7466 // size so that a constant array could be eventually used.
7467 ArrayType *PointerArrayType =
7468 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
7469
7470 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
7471 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
7472
7473 Info.RTArgs.PointersArray = Builder.CreateAlloca(
7474 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
7475 AllocaInst *MappersArray = Builder.CreateAlloca(
7476 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
7477 Info.RTArgs.MappersArray = MappersArray;
7478
7479 // If we don't have any VLA types or other types that require runtime
7480 // evaluation, we can use a constant array for the map sizes, otherwise we
7481 // need to fill up the arrays as we do for the pointers.
7482 Type *Int64Ty = Builder.getInt64Ty();
7483 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
7484 ConstantInt::get(Int64Ty, 0));
7485 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
7486 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
7487 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
7488 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
7489 if (IsNonContiguous &&
7490 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7491 CombinedInfo.Types[I] &
7492 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
7493 ConstSizes[I] =
7494 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
7495 else
7496 ConstSizes[I] = CI;
7497 continue;
7498 }
7499 }
7500 RuntimeSizes.set(I);
7501 }
7502
7503 if (RuntimeSizes.all()) {
7504 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7505 Info.RTArgs.SizesArray = Builder.CreateAlloca(
7506 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7507 Builder.restoreIP(CodeGenIP);
7508 } else {
7509 auto *SizesArrayInit = ConstantArray::get(
7510 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
7511 std::string Name = createPlatformSpecificName({"offload_sizes"});
7512 auto *SizesArrayGbl =
7513 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
7514 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
7515 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
7516
7517 if (!RuntimeSizes.any()) {
7518 Info.RTArgs.SizesArray = SizesArrayGbl;
7519 } else {
7520 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7521 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
7522 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7523 AllocaInst *Buffer = Builder.CreateAlloca(
7524 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7525 Buffer->setAlignment(OffloadSizeAlign);
7526 Builder.restoreIP(CodeGenIP);
7527 Builder.CreateMemCpy(
7528 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
7529 SizesArrayGbl, OffloadSizeAlign,
7530 Builder.getIntN(
7531 IndexSize,
7532 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
7533
7534 Info.RTArgs.SizesArray = Buffer;
7535 }
7536 Builder.restoreIP(CodeGenIP);
7537 }
7538
7539 // The map types are always constant so we don't need to generate code to
7540 // fill arrays. Instead, we create an array constant.
7541 SmallVector<uint64_t, 4> Mapping;
7542 for (auto mapFlag : CombinedInfo.Types)
7543 Mapping.push_back(
7544 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7545 mapFlag));
7546 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
7547 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7548 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
7549
7550 // The information types are only built if provided.
7551 if (!CombinedInfo.Names.empty()) {
7552 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
7553 auto *MapNamesArrayGbl =
7554 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
7555 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
7556 Info.EmitDebug = true;
7557 } else {
7558 Info.RTArgs.MapNamesArray =
7559 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
7560 Info.EmitDebug = false;
7561 }
7562
7563 // If there's a present map type modifier, it must not be applied to the end
7564 // of a region, so generate a separate map type array in that case.
7565 if (Info.separateBeginEndCalls()) {
7566 bool EndMapTypesDiffer = false;
7567 for (uint64_t &Type : Mapping) {
7568 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7569 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
7570 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7571 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
7572 EndMapTypesDiffer = true;
7573 }
7574 }
7575 if (EndMapTypesDiffer) {
7576 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7577 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
7578 }
7579 }
7580
7581 PointerType *PtrTy = Builder.getPtrTy();
7582 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
7583 Value *BPVal = CombinedInfo.BasePointers[I];
7584 Value *BP = Builder.CreateConstInBoundsGEP2_32(
7585 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
7586 0, I);
7587 Builder.CreateAlignedStore(BPVal, BP,
7588 M.getDataLayout().getPrefTypeAlign(PtrTy));
7589
7590 if (Info.requiresDevicePointerInfo()) {
7591 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
7592 CodeGenIP = Builder.saveIP();
7593 Builder.restoreIP(AllocaIP);
7594 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
7595 Builder.restoreIP(CodeGenIP);
7596 if (DeviceAddrCB)
7597 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
7598 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
7599 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
7600 if (DeviceAddrCB)
7601 DeviceAddrCB(I, BP);
7602 }
7603 }
7604
7605 Value *PVal = CombinedInfo.Pointers[I];
7606 Value *P = Builder.CreateConstInBoundsGEP2_32(
7607 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
7608 I);
7609 // TODO: Check alignment correct.
7610 Builder.CreateAlignedStore(PVal, P,
7611 M.getDataLayout().getPrefTypeAlign(PtrTy));
7612
7613 if (RuntimeSizes.test(I)) {
7614 Value *S = Builder.CreateConstInBoundsGEP2_32(
7615 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7616 /*Idx0=*/0,
7617 /*Idx1=*/I);
7618 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
7619 Int64Ty,
7620 /*isSigned=*/true),
7621 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
7622 }
7623 // Fill up the mapper array.
7624 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7625 Value *MFunc = ConstantPointerNull::get(PtrTy);
7626 if (CustomMapperCB)
7627 if (Value *CustomMFunc = CustomMapperCB(I))
7628 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
7629 Value *MAddr = Builder.CreateInBoundsGEP(
7630 MappersArray->getAllocatedType(), MappersArray,
7631 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
7632 Builder.CreateAlignedStore(
7633 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
7634 }
7635
7636 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
7637 Info.NumberOfPtrs == 0)
7638 return;
7639 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
7640}
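// For illustration (not part of this file): for a map clause with two
// operands whose sizes are compile-time constants, the code above emits IR
// roughly of the form
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs = alloca [2 x ptr]
//   @.offload_sizes = private unnamed_addr constant [2 x i64] [...]
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [...]
// followed by per-element stores into the base-pointer and pointer arrays;
// only entries with runtime-dependent sizes get stores into a stack copy of
// the sizes array.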
7641
7642void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
7643 BasicBlock *CurBB = Builder.GetInsertBlock();
7644
7645 if (!CurBB || CurBB->getTerminator()) {
7646 // If there is no insert point or the previous block is already
7647 // terminated, don't touch it.
7648 } else {
7649 // Otherwise, create a fall-through branch.
7650 Builder.CreateBr(Target);
7651 }
7652
7653 Builder.ClearInsertionPoint();
7654}
7655
7656void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
7657 bool IsFinished) {
7658 BasicBlock *CurBB = Builder.GetInsertBlock();
7659
7660 // Fall out of the current block (if necessary).
7661 emitBranch(BB);
7662
7663 if (IsFinished && BB->use_empty()) {
7664 BB->eraseFromParent();
7665 return;
7666 }
7667
7668 // Place the block after the current block, if possible, or else at
7669 // the end of the function.
7670 if (CurBB && CurBB->getParent())
7671 CurFn->insert(std::next(CurBB->getIterator()), BB);
7672 else
7673 CurFn->insert(CurFn->end(), BB);
7674 Builder.SetInsertPoint(BB);
7675}
7676
7677void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
7678 BodyGenCallbackTy ElseGen,
7679 InsertPointTy AllocaIP) {
7680 // If the condition constant folds and can be elided, try to avoid emitting
7681 // the condition and the dead arm of the if/else.
7682 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
7683 auto CondConstant = CI->getSExtValue();
7684 if (CondConstant)
7685 ThenGen(AllocaIP, Builder.saveIP());
7686 else
7687 ElseGen(AllocaIP, Builder.saveIP());
7688 return;
7689 }
7690
7691 Function *CurFn = Builder.GetInsertBlock()->getParent();
7692
7693 // Otherwise, the condition did not fold, or we couldn't elide it. Just
7694 // emit the conditional branch.
7695 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
7696 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
7697 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
7698 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
7699 // Emit the 'then' code.
7700 emitBlock(ThenBlock, CurFn);
7701 ThenGen(AllocaIP, Builder.saveIP());
7702 emitBranch(ContBlock);
7703 // Emit the 'else' code if present.
7704 // There is no need to emit line number for unconditional branch.
7705 emitBlock(ElseBlock, CurFn);
7706 ElseGen(AllocaIP, Builder.saveIP());
7707 // There is no need to emit line number for unconditional branch.
7708 emitBranch(ContBlock);
7709 // Emit the continuation block for code after the if.
7710 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
7711}
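// For illustration (not part of this file): for a non-constant condition the
// helper above emits the diamond
//   br i1 %cond, label %omp_if.then, label %omp_if.else
//   omp_if.then: ... br label %omp_if.end
//   omp_if.else: ... br label %omp_if.end
//   omp_if.end: ...
// while a constant condition emits only the surviving arm.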
7712
7713bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
7714 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
7717 "Unexpected Atomic Ordering.");
7718
7719 bool Flush = false;
7720 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
7721
7722 switch (AK) {
7723 case Read:
7724 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
7725 AO == AtomicOrdering::SequentiallyConsistent) {
7726 FlushAO = AtomicOrdering::Acquire;
7727 Flush = true;
7728 }
7729 break;
7730 case Write:
7731 case Compare:
7732 case Update:
7733 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
7734 AO == AtomicOrdering::SequentiallyConsistent) {
7735 FlushAO = AtomicOrdering::Release;
7736 Flush = true;
7737 }
7738 break;
7739 case Capture:
7740 switch (AO) {
7741 case AtomicOrdering::Acquire:
7742 FlushAO = AtomicOrdering::Acquire;
7743 Flush = true;
7744 break;
7745 case AtomicOrdering::Release:
7746 FlushAO = AtomicOrdering::Release;
7747 Flush = true;
7748 break;
7749 case AtomicOrdering::AcquireRelease:
7750 case AtomicOrdering::SequentiallyConsistent:
7751 FlushAO = AtomicOrdering::AcquireRelease;
7752 Flush = true;
7753 break;
7754 default:
7755 // do nothing - leave silently.
7756 break;
7757 }
7758 }
7759
7760 if (Flush) {
7761 // The flush runtime call does not take a memory ordering yet. Until it
7762 // does, we still resolve which atomic ordering the flush would use, but
7763 // issue the call without it.
7764 // TODO: pass `FlushAO` after memory ordering support is added
7765 (void)FlushAO;
7766 emitFlush(Loc);
7767 }
7768
7769 // For AO == AtomicOrdering::Monotonic and all other combinations, do
7770 // nothing.
7771 return Flush;
7772}
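// For illustration (not part of this file): a construct such as
// '#pragma omp atomic read seq_cst' reaches this helper with
// AO == SequentiallyConsistent and AK == Read, so a flush runtime call is
// emitted after the atomic operation; with the default monotonic (relaxed)
// ordering no flush is emitted.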
7773
7774OpenMPIRBuilder::InsertPointTy
7775OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
7776 AtomicOpValue &X, AtomicOpValue &V,
7777 AtomicOrdering AO) {
7778 if (!updateToLocation(Loc))
7779 return Loc.IP;
7780
7781 assert(X.Var->getType()->isPointerTy() &&
7782 "OMP Atomic expects a pointer to target memory");
7783 Type *XElemTy = X.ElemTy;
7784 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7785 XElemTy->isPointerTy()) &&
7786 "OMP atomic read expected a scalar type");
7787
7788 Value *XRead = nullptr;
7789
7790 if (XElemTy->isIntegerTy()) {
7791 LoadInst *XLD =
7792 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
7793 XLD->setAtomic(AO);
7794 XRead = cast<Value>(XLD);
7795 } else {
7796 // We need to perform atomic op as integer
7797 IntegerType *IntCastTy =
7798 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7799 LoadInst *XLoad =
7800 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
7801 XLoad->setAtomic(AO);
7802 if (XElemTy->isFloatingPointTy()) {
7803 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
7804 } else {
7805 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
7806 }
7807 }
7808 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
7809 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
7810 return Builder.saveIP();
7811}
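// For illustration (not part of this file): reading a 'float' atomically via
// this path produces IR roughly like
//   %omp.atomic.load = load atomic i32, ptr %x monotonic, align 4
//   %atomic.flt.cast = bitcast i32 %omp.atomic.load to float
//   store float %atomic.flt.cast, ptr %v
// i.e. the load is performed on an integer of equal width and bitcast back.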
7812
7813OpenMPIRBuilder::InsertPointTy
7814OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
7815 AtomicOpValue &X, Value *Expr,
7816 AtomicOrdering AO) {
7817 if (!updateToLocation(Loc))
7818 return Loc.IP;
7819
7820 assert(X.Var->getType()->isPointerTy() &&
7821 "OMP Atomic expects a pointer to target memory");
7822 Type *XElemTy = X.ElemTy;
7823 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7824 XElemTy->isPointerTy()) &&
7825 "OMP atomic write expected a scalar type");
7826
7827 if (XElemTy->isIntegerTy()) {
7828 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
7829 XSt->setAtomic(AO);
7830 } else {
7831 // We need to bitcast and perform atomic op as integers
7832 IntegerType *IntCastTy =
7833 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7834 Value *ExprCast =
7835 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
7836 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
7837 XSt->setAtomic(AO);
7838 }
7839
7840 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
7841 return Builder.saveIP();
7842}
7843
7844OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
7845 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7846 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7847 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
7848 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
7849 if (!updateToLocation(Loc))
7850 return Loc.IP;
7851
7852 LLVM_DEBUG({
7853 Type *XTy = X.Var->getType();
7854 assert(XTy->isPointerTy() &&
7855 "OMP Atomic expects a pointer to target memory");
7856 Type *XElemTy = X.ElemTy;
7857 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7858 XElemTy->isPointerTy()) &&
7859 "OMP atomic update expected a scalar type");
7860 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7861 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
7862 "OpenMP atomic does not support LT or GT operations");
7863 });
7864
7865 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
7866 X.IsVolatile, IsXBinopExpr);
7867 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
7868 return Builder.saveIP();
7869}
7870
7871// FIXME: Duplicating AtomicExpand
7872Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
7873 AtomicRMWInst::BinOp RMWOp) {
7874 switch (RMWOp) {
7875 case AtomicRMWInst::Add:
7876 return Builder.CreateAdd(Src1, Src2);
7877 case AtomicRMWInst::Sub:
7878 return Builder.CreateSub(Src1, Src2);
7879 case AtomicRMWInst::And:
7880 return Builder.CreateAnd(Src1, Src2);
7881 case AtomicRMWInst::Nand:
7882 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
7883 case AtomicRMWInst::Or:
7884 return Builder.CreateOr(Src1, Src2);
7885 case AtomicRMWInst::Xor:
7886 return Builder.CreateXor(Src1, Src2);
7887 case AtomicRMWInst::Xchg:
7888 case AtomicRMWInst::FAdd:
7889 case AtomicRMWInst::FSub:
7890 case AtomicRMWInst::BAD_BINOP:
7891 case AtomicRMWInst::Max:
7892 case AtomicRMWInst::Min:
7893 case AtomicRMWInst::UMax:
7894 case AtomicRMWInst::UMin:
7895 case AtomicRMWInst::FMax:
7896 case AtomicRMWInst::FMin:
7897 case AtomicRMWInst::UIncWrap:
7898 case AtomicRMWInst::UDecWrap:
7899 llvm_unreachable("Unsupported atomic update operation");
7900 }
7901 llvm_unreachable("Unsupported atomic update operation");
7902}
7903
7904std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
7905 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
7906 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7907 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
7908 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
7909 // or a complex datatype.
7910 bool emitRMWOp = false;
7911 switch (RMWOp) {
7912 case AtomicRMWInst::Add:
7913 case AtomicRMWInst::And:
7914 case AtomicRMWInst::Nand:
7915 case AtomicRMWInst::Or:
7916 case AtomicRMWInst::Xor:
7917 case AtomicRMWInst::Xchg:
7918 emitRMWOp = XElemTy;
7919 break;
7920 case AtomicRMWInst::Sub:
7921 emitRMWOp = (IsXBinopExpr && XElemTy);
7922 break;
7923 default:
7924 emitRMWOp = false;
7925 }
7926 emitRMWOp &= XElemTy->isIntegerTy();
7927
7928 std::pair<Value *, Value *> Res;
7929 if (emitRMWOp) {
7930 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
7931 // Not needed except in case of postfix captures. Generated anyway for
7932 // consistency with the else part; any DCE pass will remove it.
7933 // AtomicRMWInst::Xchg does not have a corresponding non-atomic instruction.
7934 if (RMWOp == AtomicRMWInst::Xchg)
7935 Res.second = Res.first;
7936 else
7937 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
7938 } else {
7939 IntegerType *IntCastTy =
7940 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7941 LoadInst *OldVal =
7942 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
7943 OldVal->setAtomic(AO);
7944 // CurBB
7945 // | /---\
7946 // ContBB |
7947 // | \---/
7948 // ExitBB
7949 BasicBlock *CurBB = Builder.GetInsertBlock();
7950 Instruction *CurBBTI = CurBB->getTerminator();
7951 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
7952 BasicBlock *ExitBB =
7953 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
7954 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
7955 X->getName() + ".atomic.cont");
7956 ContBB->getTerminator()->eraseFromParent();
7957 Builder.restoreIP(AllocaIP);
7958 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
7959 NewAtomicAddr->setName(X->getName() + "x.new.val");
7960 Builder.SetInsertPoint(ContBB);
7961 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
7962 PHI->addIncoming(OldVal, CurBB);
7963 bool IsIntTy = XElemTy->isIntegerTy();
7964 Value *OldExprVal = PHI;
7965 if (!IsIntTy) {
7966 if (XElemTy->isFloatingPointTy()) {
7967 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
7968 X->getName() + ".atomic.fltCast");
7969 } else {
7970 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
7971 X->getName() + ".atomic.ptrCast");
7972 }
7973 }
7974
7975 Value *Upd = UpdateOp(OldExprVal, Builder);
7976 Builder.CreateStore(Upd, NewAtomicAddr);
7977 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
7978 AtomicOrdering Failure =
7979 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
7980 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
7981 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
7982 Result->setVolatile(VolatileX);
7983 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
7984 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7985 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
7986 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
7987
7988 Res.first = OldExprVal;
7989 Res.second = Upd;
7990
7991 // set Insertion point in exit block
7992 if (UnreachableInst *ExitTI =
7993 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
7994 CurBBTI->eraseFromParent();
7995 Builder.SetInsertPoint(ExitBB);
7996 } else {
7997 Builder.SetInsertPoint(ExitTI);
7998 }
7999 }
8000
8001 return Res;
8002}
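// For illustration (not part of this file): when no single atomicrmw matches
// the update, the loop built above looks roughly like
//   %old = load atomic i32, ptr %x monotonic
//   br label %x.atomic.cont
// x.atomic.cont:
//   %phi = phi i32 [ %old, %entry ], [ %prev, %x.atomic.cont ]
//   ; ... compute %upd = UpdateOp(%phi) ...
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd monotonic monotonic
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %x.atomic.exit, label %x.atomic.cont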
8003
8004OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
8005 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8006 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8007 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8008 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8009 if (!updateToLocation(Loc))
8010 return Loc.IP;
8011
8012 LLVM_DEBUG({
8013 Type *XTy = X.Var->getType();
8014 assert(XTy->isPointerTy() &&
8015 "OMP Atomic expects a pointer to target memory");
8016 Type *XElemTy = X.ElemTy;
8017 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8018 XElemTy->isPointerTy()) &&
8019 "OMP atomic capture expected a scalar type");
8020 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8021 "OpenMP atomic does not support LT or GT operations");
8022 });
8023
8024 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8025 // 'x' is simply atomically rewritten with 'expr'.
8026 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8027 std::pair<Value *, Value *> Result =
8028 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8029 X.IsVolatile, IsXBinopExpr);
8030
8031 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
8032 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8033
8034 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8035 return Builder.saveIP();
8036}
8037
8038OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8039 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8040 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8041 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8042 bool IsFailOnly) {
8043
8044 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8045 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8046 IsPostfixUpdate, IsFailOnly, Failure);
8047}
8048
8049OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8050 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8051 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8052 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8053 bool IsFailOnly, AtomicOrdering Failure) {
8054
8055 if (!updateToLocation(Loc))
8056 return Loc.IP;
8057
8058 assert(X.Var->getType()->isPointerTy() &&
8059 "OMP atomic expects a pointer to target memory");
8060 // compare capture
8061 if (V.Var) {
8062 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8063 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8064 }
8065
8066 bool IsInteger = E->getType()->isIntegerTy();
8067
8068 if (Op == OMPAtomicCompareOp::EQ) {
8069 AtomicCmpXchgInst *Result = nullptr;
8070 if (!IsInteger) {
8071 IntegerType *IntCastTy =
8072 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8073 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8074 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8075 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8076 AO, Failure);
8077 } else {
8078 Result =
8079 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8080 }
8081
8082 if (V.Var) {
8083 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8084 if (!IsInteger)
8085 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8086 assert(OldValue->getType() == V.ElemTy &&
8087 "OldValue and V must be of same type");
8088 if (IsPostfixUpdate) {
8089 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8090 } else {
8091 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8092 if (IsFailOnly) {
8093 // CurBB----
8094 // | |
8095 // v |
8096 // ContBB |
8097 // | |
8098 // v |
8099 // ExitBB <-
8100 //
8101 // where ContBB only contains the store of old value to 'v'.
8102 BasicBlock *CurBB = Builder.GetInsertBlock();
8103 Instruction *CurBBTI = CurBB->getTerminator();
8104 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8105 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8106 CurBBTI, X.Var->getName() + ".atomic.exit");
8107 BasicBlock *ContBB = CurBB->splitBasicBlock(
8108 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8109 ContBB->getTerminator()->eraseFromParent();
8110 CurBB->getTerminator()->eraseFromParent();
8111
8112 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8113
8114 Builder.SetInsertPoint(ContBB);
8115 Builder.CreateStore(OldValue, V.Var);
8116 Builder.CreateBr(ExitBB);
8117
8118 if (UnreachableInst *ExitTI =
8119 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8120 CurBBTI->eraseFromParent();
8121 Builder.SetInsertPoint(ExitBB);
8122 } else {
8123 Builder.SetInsertPoint(ExitTI);
8124 }
8125 } else {
8126 Value *CapturedValue =
8127 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8128 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8129 }
8130 }
8131 }
8132 // The comparison result has to be stored.
8133 if (R.Var) {
8134 assert(R.Var->getType()->isPointerTy() &&
8135 "r.var must be of pointer type");
8136 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8137
8138 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8139 Value *ResultCast = R.IsSigned
8140 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8141 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8142 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8143 }
8144 } else {
8145 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8146 "Op should be either max or min at this point");
8147 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8148
8149 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8150 // Let's take max as example.
8151 // OpenMP form:
8152 // x = x > expr ? expr : x;
8153 // LLVM form:
8154 // *ptr = *ptr > val ? *ptr : val;
8155 // We need to transform to LLVM form.
8156 // x = x <= expr ? x : expr;
8157 AtomicRMWInst::BinOp NewOp;
8158 if (IsXBinopExpr) {
8159 if (IsInteger) {
8160 if (X.IsSigned)
8161 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8162 : AtomicRMWInst::Max;
8163 else
8164 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8165 : AtomicRMWInst::UMax;
8166 } else {
8167 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8168 : AtomicRMWInst::FMax;
8169 }
8170 } else {
8171 if (IsInteger) {
8172 if (X.IsSigned)
8173 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8174 : AtomicRMWInst::Min;
8175 else
8176 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8177 : AtomicRMWInst::UMin;
8178 } else {
8179 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8180 : AtomicRMWInst::FMin;
8181 }
8182 }
8183
8184 AtomicRMWInst *OldValue =
8185 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8186 if (V.Var) {
8187 Value *CapturedValue = nullptr;
8188 if (IsPostfixUpdate) {
8189 CapturedValue = OldValue;
8190 } else {
8191 CmpInst::Predicate Pred;
8192 switch (NewOp) {
8193 case AtomicRMWInst::Max:
8194 Pred = CmpInst::ICMP_SGT;
8195 break;
8196 case AtomicRMWInst::UMax:
8197 Pred = CmpInst::ICMP_UGT;
8198 break;
8199 case AtomicRMWInst::FMax:
8200 Pred = CmpInst::FCMP_OGT;
8201 break;
8202 case AtomicRMWInst::Min:
8203 Pred = CmpInst::ICMP_SLT;
8204 break;
8205 case AtomicRMWInst::UMin:
8206 Pred = CmpInst::ICMP_ULT;
8207 break;
8208 case AtomicRMWInst::FMin:
8209 Pred = CmpInst::FCMP_OLT;
8210 break;
8211 default:
8212 llvm_unreachable("unexpected comparison op");
8213 }
8214 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8215 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8216 }
8217 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8218 }
8219 }
8220
8221 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8222
8223 return Builder.saveIP();
8224}
8225
8226OpenMPIRBuilder::InsertPointTy
8227OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8228 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8229 Value *NumTeamsUpper, Value *ThreadLimit,
8230 Value *IfExpr) {
8231 if (!updateToLocation(Loc))
8232 return InsertPointTy();
8233
8234 uint32_t SrcLocStrSize;
8235 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8236 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8237 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8238
8239 // Outer allocation basicblock is the entry block of the current function.
8240 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8241 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8242 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8243 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8244 }
8245
8246 // The current basic block is split into four basic blocks. After outlining,
8247 // they will be mapped as follows:
8248 // ```
8249 // def current_fn() {
8250 // current_basic_block:
8251 // br label %teams.exit
8252 // teams.exit:
8253 // ; instructions after teams
8254 // }
8255 //
8256 // def outlined_fn() {
8257 // teams.alloca:
8258 // br label %teams.body
8259 // teams.body:
8260 // ; instructions within teams body
8261 // }
8262 // ```
8263 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8264 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8265 BasicBlock *AllocaBB =
8266 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8267
8268 bool SubClausesPresent =
8269 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8270 // Push num_teams
8271 if (!Config.isTargetDevice() && SubClausesPresent) {
8272 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8273 "if lowerbound is non-null, then upperbound must also be non-null "
8274 "for bounds on num_teams");
8275
8276 if (NumTeamsUpper == nullptr)
8277 NumTeamsUpper = Builder.getInt32(0);
8278
8279 if (NumTeamsLower == nullptr)
8280 NumTeamsLower = NumTeamsUpper;
8281
8282 if (IfExpr) {
8283 assert(IfExpr->getType()->isIntegerTy() &&
8284 "argument to if clause must be an integer value");
8285
8286 // upper = ifexpr ? upper : 1
8287 if (IfExpr->getType() != Int1)
8288 IfExpr = Builder.CreateICmpNE(IfExpr,
8289 ConstantInt::get(IfExpr->getType(), 0));
8290 NumTeamsUpper = Builder.CreateSelect(
8291 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8292
8293 // lower = ifexpr ? lower : 1
8294 NumTeamsLower = Builder.CreateSelect(
8295 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8296 }
8297
8298 if (ThreadLimit == nullptr)
8299 ThreadLimit = Builder.getInt32(0);
8300
8301 Value *ThreadNum = getOrCreateThreadID(Ident);
8302 Builder.CreateCall(
8303 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8304 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8305 }
8306 // Generate the body of teams.
8307 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8308 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8309 BodyGenCB(AllocaIP, CodeGenIP);
8310
8311 OutlineInfo OI;
8312 OI.EntryBB = AllocaBB;
8313 OI.ExitBB = ExitBB;
8314 OI.OuterAllocaBB = &OuterAllocaBB;
8315
8316 // Insert fake values for global tid and bound tid.
8317 SmallVector<Instruction *, 8> ToBeDeleted;
8318 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8319 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8320 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8321 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8322 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8323
8324 auto HostPostOutlineCB = [this, Ident,
8325 ToBeDeleted](Function &OutlinedFn) mutable {
8326 // The stale call instruction will be replaced with a new call instruction
8327 // for runtime call with the outlined function.
8328
8329 assert(OutlinedFn.getNumUses() == 1 &&
8330 "there must be a single user for the outlined function");
8331 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8332 ToBeDeleted.push_back(StaleCI);
8333
8334 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8335 "Outlined function must have two or three arguments only");
8336
8337 bool HasShared = OutlinedFn.arg_size() == 3;
8338
8339 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8340 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8341 if (HasShared)
8342 OutlinedFn.getArg(2)->setName("data");
8343
8344 // Call to the runtime function for teams in the current function.
8345 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8346 "outlined function.");
8347 Builder.SetInsertPoint(StaleCI);
8348 SmallVector<Value *> Args = {
8349 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8350 if (HasShared)
8351 Args.push_back(StaleCI->getArgOperand(2));
8352 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
8353 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8354 Args);
8355
8356 for (Instruction *I : llvm::reverse(ToBeDeleted))
8357 I->eraseFromParent();
8358 };
8359
8360 if (!Config.isTargetDevice())
8361 OI.PostOutlineCB = HostPostOutlineCB;
8362
8363 addOutlineInfo(std::move(OI));
8364
8365 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
8366
8367 return Builder.saveIP();
8368}
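// For illustration (not part of this file): on the host, the body outlined
// above ends up invoked through the runtime, roughly as
//   call void @__kmpc_push_num_teams_51(ptr @ident, i32 %tid,
//                                       i32 %lo, i32 %up, i32 %limit)
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(
//       ptr @ident, i32 1, ptr @outlined_fn, ptr %data)
// with the push call emitted only when one of the clauses is present.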
8369
8370GlobalVariable *
8371OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
8372 std::string VarName) {
8373 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
8374 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
8375 Names.size()),
8376 Names);
8377 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
8378 M, MapNamesArrayInit->getType(),
8379 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
8380 VarName);
8381 return MapNamesArrayGlobal;
8382}
8383
8384// Create all simple and struct types exposed by the runtime and remember
8385// the llvm::PointerTypes of them for easy access later.
8386void OpenMPIRBuilder::initializeTypes(Module &M) {
8387 LLVMContext &Ctx = M.getContext();
8388 StructType *T;
8389#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
8390#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
8391 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
8392 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
8393#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
8394 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
8395 VarName##Ptr = PointerType::getUnqual(VarName);
8396#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
8397 T = StructType::getTypeByName(Ctx, StructName); \
8398 if (!T) \
8399 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
8400 VarName = T; \
8401 VarName##Ptr = PointerType::getUnqual(T);
8402#include "llvm/Frontend/OpenMP/OMPKinds.def"
8403}
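// For illustration (an assumption about the .def contents, not shown here):
// an OMP_STRUCT_TYPE expansion means that, e.g., the source-location struct
// "struct.ident_t" is looked up by name in the context, created if it does
// not exist yet, and cached together with a pointer-to-struct type for later
// runtime calls.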
8404
8405void OpenMPIRBuilder::OutlineInfo::collectBlocks(
8406 SmallPtrSetImpl<BasicBlock *> &BlockSet,
8407 SmallVectorImpl<BasicBlock *> &BlockVector) {
8408 SmallVector<BasicBlock *, 32> Worklist;
8409 BlockSet.insert(EntryBB);
8410 BlockSet.insert(ExitBB);
8411
8412 Worklist.push_back(EntryBB);
8413 while (!Worklist.empty()) {
8414 BasicBlock *BB = Worklist.pop_back_val();
8415 BlockVector.push_back(BB);
8416 for (BasicBlock *SuccBB : successors(BB))
8417 if (BlockSet.insert(SuccBB).second)
8418 Worklist.push_back(SuccBB);
8419 }
8420}
8421
8422void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
8423 uint64_t Size, int32_t Flags,
8424 GlobalValue::LinkageTypes,
8425 StringRef Name) {
8426 if (!Config.isGPU()) {
8427 llvm::offloading::emitOffloadingEntry(
8428 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
8429 "omp_offloading_entries");
8430 return;
8431 }
8432 // TODO: Add support for global variables on the device after declare target
8433 // support.
8434 Function *Fn = dyn_cast<Function>(Addr);
8435 if (!Fn)
8436 return;
8437
8438 Module &M = *(Fn->getParent());
8439 LLVMContext &Ctx = M.getContext();
8440
8441 // Get "nvvm.annotations" metadata node.
8442 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
8443
8444 Metadata *MDVals[] = {
8445 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
8446 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
8447 // Append metadata to nvvm.annotations.
8448 MD->addOperand(MDNode::get(Ctx, MDVals));
8449
8450 // Add a function attribute for the kernel.
8451 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
8452 if (T.isAMDGCN())
8453 Fn->addFnAttr("uniform-work-group-size", "true");
8454 Fn->addFnAttr(Attribute::MustProgress);
8455}
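// For illustration (not part of this file): for an NVPTX target the code
// above leaves the kernel marked roughly as
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @__omp_offloading_..._fn, !"kernel", i32 1}
// plus a "kernel" function attribute; on AMDGCN a
// "uniform-work-group-size"="true" attribute is added as well.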
8456
8457// We only generate metadata for functions that contain target regions.
8458void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
8459 EmitMetadataErrorReportFunctionTy &ErrorFn) {
8460
8461 // If there are no entries, we don't need to do anything.
8462 if (OffloadInfoManager.empty())
8463 return;
8464
8465 LLVMContext &C = M.getContext();
8466 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
8467 TargetRegionEntryInfo>,
8468 16>
8469 OrderedEntries(OffloadInfoManager.size());
8470
8471 // Auxiliary methods to create metadata values and strings.
8472 auto &&GetMDInt = [this](unsigned V) {
8473 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
8474 };
8475
8476 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
8477
8478 // Create the offloading info metadata node.
8479 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
8480 auto &&TargetRegionMetadataEmitter =
8481 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
8482 const TargetRegionEntryInfo &EntryInfo,
8483 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
8484 // Generate metadata for target regions. Each entry of this metadata
8485 // contains:
8486 // - Entry 0 -> Kind of this type of metadata (0).
8487 // - Entry 1 -> Device ID of the file where the entry was identified.
8488 // - Entry 2 -> File ID of the file where the entry was identified.
8489 // - Entry 3 -> Mangled name of the function where the entry was
8490 // identified.
8491 // - Entry 4 -> Line in the file where the entry was identified.
8492 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
8493 // - Entry 6 -> Order the entry was created.
8494 // The first element of the metadata node is the kind.
8495 Metadata *Ops[] = {
8496 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
8497 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
8498 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
8499 GetMDInt(E.getOrder())};
8500
8501 // Save this entry in the right position of the ordered entries array.
8502 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
8503
8504 // Add metadata to the named metadata node.
8505 MD->addOperand(MDNode::get(C, Ops));
8506 };
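// For illustration (not part of this file): one resulting operand of
// !omp_offload.info looks roughly like
//   !{i32 0, i32 <device-id>, i32 <file-id>, !"parent_fn", i32 <line>,
//     i32 <count>, i32 <order>}
// matching the entry layout documented above.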
8507
8508 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
8509
8510 // Create function that emits metadata for each device global variable entry;
8511 auto &&DeviceGlobalVarMetadataEmitter =
8512 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
8513 StringRef MangledName,
8515 // Generate metadata for global variables. Each entry of this metadata
8516 // contains:
8517 // - Entry 0 -> Kind of this type of metadata (1).
8518 // - Entry 1 -> Mangled name of the variable.
8519 // - Entry 2 -> Declare target kind.
8520 // - Entry 3 -> Order the entry was created.
8521 // The first element of the metadata node is the kind.
8522 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
8523 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
8524
8525 // Save this entry in the right position of the ordered entries array.
8526 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
8527 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
8528
8529 // Add metadata to the named metadata node.
8530 MD->addOperand(MDNode::get(C, Ops));
8531 };
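// For illustration (not part of this file): a device global variable entry
// becomes an operand roughly of the form
//   !{i32 1, !"mangled_var_name", i32 <declare-target-kind>, i32 <order>}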
8532
8533 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
8534 DeviceGlobalVarMetadataEmitter);
8535
8536 for (const auto &E : OrderedEntries) {
8537 assert(E.first && "All ordered entries must exist!");
8538 if (const auto *CE =
8539 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
8540 E.first)) {
8541 if (!CE->getID() || !CE->getAddress()) {
8542 // Do not blame the entry if the parent function is not emitted.
8543 TargetRegionEntryInfo EntryInfo = E.second;
8544 StringRef FnName = EntryInfo.ParentName;
8545 if (!M.getNamedValue(FnName))
8546 continue;
8547 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
8548 continue;
8549 }
8550 createOffloadEntry(CE->getID(), CE->getAddress(),
8551 /*Size=*/0, CE->getFlags(),
8552 GlobalValue::WeakAnyLinkage);
8553 } else if (const auto *CE = dyn_cast<
8554 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
8555 E.first)) {
8556 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
8557 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8558 CE->getFlags());
8559 switch (Flags) {
8560 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
8561 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
8562 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
8563 continue;
8564 if (!CE->getAddress()) {
8565 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
8566 continue;
8567 }
8568 // The variable has no definition - no need to add the entry.
8569 if (CE->getVarSize() == 0)
8570 continue;
8571 break;
8572 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
8573 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
8574 (!Config.isTargetDevice() && CE->getAddress())) &&
8575 "Declare target link address is set.");
8576 if (Config.isTargetDevice())
8577 continue;
8578 if (!CE->getAddress()) {
8579 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, E.second);
8580 continue;
8581 }
8582 break;
8583 default:
8584 break;
8585 }
8586
8587 // Hidden or internal symbols on the device are not externally visible.
8588 // We should not attempt to register them by creating an offloading
8589 // entry. Indirect variables are handled separately on the device.
8590 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
8591 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
8592 Config.isTargetDevice())
8593 continue;
8594
8595 // Indirect globals need to use a special name that doesn't match the name
8596 // of the associated host global.
8597 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
8598 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8599 Flags, CE->getLinkage(), CE->getVarName());
8600 else
8601 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8602 Flags, CE->getLinkage());
8603
8604 } else {
8605 llvm_unreachable("Unsupported entry kind.");
8606 }
8607 }
8608
8609 // Emit requires directive globals to a special entry so the runtime can
8610 // register them when the device image is loaded.
8611 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
8612 // entries should be redesigned to better suit this use-case.
8613 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
8614 offloading::emitOffloadingEntry(
8615 M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
8616 /*Name=*/"",
8617 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
8618 Config.getRequiresFlags(), "omp_offloading_entries");
8619}
8620
8621void TargetRegionEntryInfo::getTargetRegionEntryFnName(
8622 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
8623 unsigned FileID, unsigned Line, unsigned Count) {
8624 raw_svector_ostream OS(Name);
8625 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
8626 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
8627 if (Count)
8628 OS << "_" << Count;
8629}
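// For illustration (not part of this file): with the usual
// "__omp_offloading_" prefix, a region in function 'foo' at line 42 yields a
// name like "__omp_offloading_<devid>_<fileid>_foo_l42", with a trailing
// "_<count>" appended for repeated entries at the same location.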
8630
8631void OpenMPIRBuilder::getTargetRegionEntryFnName(
8632 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
8633 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
8634 TargetRegionEntryInfo::getTargetRegionEntryFnName(
8635 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
8636 EntryInfo.Line, NewCount);
8637}
8638
8639TargetRegionEntryInfo
8640OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
8641 StringRef ParentName) {
8642 sys::fs::UniqueID ID;
8643 auto FileIDInfo = CallBack();
8644 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
8645 report_fatal_error(("Unable to get unique ID for file, during "
8646 "getTargetEntryUniqueInfo, error message: " +
8647 EC.message())
8648 .c_str());
8649 }
8650
8651 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
8652 std::get<1>(FileIDInfo));
8653}
8654
8655unsigned OpenMPIRBuilder::getFlagMemberOffset() {
8656 unsigned Offset = 0;
8657 for (uint64_t Remain =
8658 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8659 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
8660 !(Remain & 1); Remain = Remain >> 1)
8661 Offset++;
8662 return Offset;
8663}
8664
8665omp::OpenMPOffloadMappingFlags
8666OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
8667 // Rotate by getFlagMemberOffset() bits.
8668 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
8669 << getFlagMemberOffset());
8670}
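// Worked example (illustrative, assuming the usual 0xffff000000000000
// encoding of OMP_MAP_MEMBER_OF): the MEMBER_OF field occupies the top 16
// bits of the 64-bit map-type flags, so getFlagMemberOffset() returns 48 and
// getMemberOfFlag(0) yields 1 << 48, i.e. member index 0 is encoded as
// MEMBER_OF(1) under the one-based convention used by the runtime.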
8671
8672void OpenMPIRBuilder::setCorrectMemberOfFlag(
8673 omp::OpenMPOffloadMappingFlags &Flags,
8674 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
8675 // If the entry is PTR_AND_OBJ but has not been marked with the special
8676 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
8677 // marked as MEMBER_OF.
8678 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8679 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
8680 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8681 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
8682 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
8683 return;
8684
8685 // Reset the placeholder value to prepare the flag for the assignment of the
8686 // proper MEMBER_OF value.
8687 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
8688 Flags |= MemberOfFlag;
8689}
8690
8691Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
8692 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8693 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8694 bool IsDeclaration, bool IsExternallyVisible,
8695 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8696 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8697 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
8698 std::function<Constant *()> GlobalInitializer,
8699 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
8700 // TODO: convert this to utilise the IRBuilder Config rather than
8701 // a passed down argument.
8702 if (OpenMPSIMD)
8703 return nullptr;
8704
8705 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
8706 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8707 CaptureClause ==
8708 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8709 Config.hasRequiresUnifiedSharedMemory())) {
8710 SmallString<64> PtrName;
8711 {
8712 raw_svector_ostream OS(PtrName);
8713 OS << MangledName;
8714 if (!IsExternallyVisible)
8715 OS << format("_%x", EntryInfo.FileID);
8716 OS << "_decl_tgt_ref_ptr";
8717 }
8718
8719 Value *Ptr = M.getNamedValue(PtrName);
8720
8721 if (!Ptr) {
8722 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
8723 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
8724
8725 auto *GV = cast<GlobalVariable>(Ptr);
8726 GV->setLinkage(GlobalValue::WeakAnyLinkage);
8727
8728 if (!Config.isTargetDevice()) {
8729 if (GlobalInitializer)
8730 GV->setInitializer(GlobalInitializer());
8731 else
8732 GV->setInitializer(GlobalValue);
8733 }
8734
8735 registerTargetGlobalVariable(
8736 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8737 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8738 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
8739 }
8740
8741 return cast<Constant>(Ptr);
8742 }
8743
8744 return nullptr;
8745}
8746
8747void OpenMPIRBuilder::registerTargetGlobalVariable(
8748 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8749 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8750 bool IsDeclaration, bool IsExternallyVisible,
8751 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8752 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8753 std::vector<Triple> TargetTriple,
8754 std::function<Constant *()> GlobalInitializer,
8755 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
8756 Constant *Addr) {
8757 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
8758 (TargetTriple.empty() && !Config.isTargetDevice()))
8759 return;
8760
8761 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
8762 StringRef VarName;
8763 int64_t VarSize;
8764 GlobalValue::LinkageTypes Linkage;
8765
8766 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8767 CaptureClause ==
8768 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8769 !Config.hasRequiresUnifiedSharedMemory()) {
8770 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8771 VarName = MangledName;
8772 GlobalValue *LlvmVal = M.getNamedValue(VarName);
8773
8774 if (!IsDeclaration)
8775 VarSize = divideCeil(
8776 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
8777 else
8778 VarSize = 0;
8779 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
8780
8781 // This is a workaround carried over from Clang which prevents undesired
8782 // optimisation of internal variables.
8783 if (Config.isTargetDevice() &&
8784 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
8785 // Do not create a "ref-variable" if the original is not also available
8786 // on the host.
8787 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
8788 return;
8789
8790 std::string RefName = createPlatformSpecificName({VarName, "ref"});
8791
8792 if (!M.getNamedValue(RefName)) {
8793 Constant *AddrRef =
8794 getOrCreateInternalVariable(Addr->getType(), RefName);
8795 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
8796 GvAddrRef->setConstant(true);
8797 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
8798 GvAddrRef->setInitializer(Addr);
8799 GeneratedRefs.push_back(GvAddrRef);
8800 }
8801 }
8802 } else {
8803 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
8804 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
8805 else
8806 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8807
8808 if (Config.isTargetDevice()) {
8809 VarName = (Addr) ? Addr->getName() : "";
8810 Addr = nullptr;
8811 } else {
8812 Addr = getAddrOfDeclareTargetVar(
8813 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8814 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8815 LlvmPtrTy, GlobalInitializer, VariableLinkage);
8816 VarName = (Addr) ? Addr->getName() : "";
8817 }
8818 VarSize = M.getDataLayout().getPointerSize();
8819 Linkage = GlobalValue::WeakAnyLinkage;
8820 }
8821
8822 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
8823 Flags, Linkage);
8824}
8825
8826/// Loads all the offload entries information from the host IR
8827/// metadata.
8828void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
8829 // If we are in target mode, load the metadata from the host IR. This code has
8830 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
8831
8832 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
8833 if (!MD)
8834 return;
8835
8836 for (MDNode *MN : MD->operands()) {
8837 auto &&GetMDInt = [MN](unsigned Idx) {
8838 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
8839 return cast<ConstantInt>(V->getValue())->getZExtValue();
8840 };
8841
8842 auto &&GetMDString = [MN](unsigned Idx) {
8843 auto *V = cast<MDString>(MN->getOperand(Idx));
8844 return V->getString();
8845 };
8846
8847 switch (GetMDInt(0)) {
8848 default:
8849 llvm_unreachable("Unexpected metadata!");
8850 break;
8851 case OffloadEntriesInfoManager::OffloadEntryInfo::
8852 OffloadingEntryInfoTargetRegion: {
8853 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
8854 /*DeviceID=*/GetMDInt(1),
8855 /*FileID=*/GetMDInt(2),
8856 /*Line=*/GetMDInt(4),
8857 /*Count=*/GetMDInt(5));
8858 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
8859 /*Order=*/GetMDInt(6));
8860 break;
8861 }
8862 case OffloadEntriesInfoManager::OffloadEntryInfo::
8863 OffloadingEntryInfoDeviceGlobalVar:
8864 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
8865 /*MangledName=*/GetMDString(1),
8866 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8867 /*Flags=*/GetMDInt(2)),
8868 /*Order=*/GetMDInt(3));
8869 break;
8870 }
8871 }
8872}
8873
8874void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
8875 if (HostFilePath.empty())
8876 return;
8877
8878 auto Buf = MemoryBuffer::getFile(HostFilePath);
8879 if (std::error_code Err = Buf.getError()) {
8880 report_fatal_error(("error opening host file from host file path inside of "
8881 "OpenMPIRBuilder: " +
8882 Err.message())
8883 .c_str());
8884 }
8885
8886 LLVMContext Ctx;
8887 auto M = expectedToErrorOrAndEmitErrors(
8888 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
8889 if (std::error_code Err = M.getError()) {
8890 report_fatal_error(
8891 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
8892 .c_str());
8893 }
8894
8895 loadOffloadInfoMetadata(*M.get());
8896}
8897
8898//===----------------------------------------------------------------------===//
8899// OffloadEntriesInfoManager
8900//===----------------------------------------------------------------------===//
8901
8902bool OffloadEntriesInfoManager::empty() const {
8903 return OffloadEntriesTargetRegion.empty() &&
8904 OffloadEntriesDeviceGlobalVar.empty();
8905}
8906
8907unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
8908 const TargetRegionEntryInfo &EntryInfo) const {
8909 auto It = OffloadEntriesTargetRegionCount.find(
8910 getTargetRegionEntryCountKey(EntryInfo));
8911 if (It == OffloadEntriesTargetRegionCount.end())
8912 return 0;
8913 return It->second;
8914}
8915
8916void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
8917 const TargetRegionEntryInfo &EntryInfo) {
8918 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
8919 EntryInfo.Count + 1;
8920}
8921
8922/// Initialize target region entry.
8924 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
8925 OffloadEntriesTargetRegion[EntryInfo] =
8926 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
8927 OMPTargetRegionEntryTargetRegion);
8928 ++OffloadingEntriesNum;
8929}
8930
8934 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
8935
8936 // Update the EntryInfo with the next available count for this location.
8937 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8938
8939 // If we are emitting code for a target, the entry is already initialized,
8940 // only has to be registered.
8941 if (OMPBuilder->Config.isTargetDevice()) {
8942 // This could happen if the device compilation is invoked standalone.
8943 if (!hasTargetRegionEntryInfo(EntryInfo)) {
8944 return;
8945 }
8946 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
8947 Entry.setAddress(Addr);
8948 Entry.setID(ID);
8949 Entry.setFlags(Flags);
8950 } else {
8951 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
8952 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
8953 return;
8954 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
8955 "Target region entry already registered!");
8956 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
8957 OffloadEntriesTargetRegion[EntryInfo] = Entry;
8958 ++OffloadingEntriesNum;
8959 }
8960 incrementTargetRegionEntryInfoCount(EntryInfo);
8961}
8962
8963bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
8964 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
8965
8966 // Update the EntryInfo with the next available count for this location.
8967 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8968
8969 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
8970 if (It == OffloadEntriesTargetRegion.end()) {
8971 return false;
8972 }
8973 // Fail if this entry is already registered.
8974 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
8975 return false;
8976 return true;
8977}
8978
8979void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
8980 const OffloadTargetRegionEntryInfoActTy &Action) {
8981 // Scan all target region entries and perform the provided action.
8982 for (const auto &It : OffloadEntriesTargetRegion) {
8983 Action(It.first, It.second);
8984 }
8985}
8986
8987void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
8988 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
8989 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
8990 ++OffloadingEntriesNum;
8991}
8992
8993void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
8994 StringRef VarName, Constant *Addr, int64_t VarSize,
8995 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
8996 if (OMPBuilder->Config.isTargetDevice()) {
8997 // This could happen if the device compilation is invoked standalone.
8998 if (!hasDeviceGlobalVarEntryInfo(VarName))
8999 return;
9000 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9001 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9002 if (Entry.getVarSize() == 0) {
9003 Entry.setVarSize(VarSize);
9004 Entry.setLinkage(Linkage);
9005 }
9006 return;
9007 }
9008 Entry.setVarSize(VarSize);
9009 Entry.setLinkage(Linkage);
9010 Entry.setAddress(Addr);
9011 } else {
9012 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9013 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9014 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9015 "Entry not initialized!");
9016 if (Entry.getVarSize() == 0) {
9017 Entry.setVarSize(VarSize);
9018 Entry.setLinkage(Linkage);
9019 }
9020 return;
9021 }
9022 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9023 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9024 Addr, VarSize, Flags, Linkage,
9025 VarName.str());
9026 else
9027 OffloadEntriesDeviceGlobalVar.try_emplace(
9028 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9029 ++OffloadingEntriesNum;
9030 }
9031}
9032
9033void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9034 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9035 // Scan all device global variable entries and perform the provided action.
9036 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9037 Action(E.getKey(), E.getValue());
9038}
9039
9040//===----------------------------------------------------------------------===//
9041// CanonicalLoopInfo
9042//===----------------------------------------------------------------------===//
9043
9044void CanonicalLoopInfo::collectControlBlocks(
9045 SmallVectorImpl<BasicBlock *> &BBs) {
9046 // We only count those BBs as control block for which we do not need to
9047 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9048 // flow. For consistency, this also means we do not add the Body block, which
9049 // is just the entry to the body code.
9050 BBs.reserve(BBs.size() + 6);
9051 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9052}
9053
9054BasicBlock *CanonicalLoopInfo::getPreheader() const {
9055 assert(isValid() && "Requires a valid canonical loop");
9056 for (BasicBlock *Pred : predecessors(Header)) {
9057 if (Pred != Latch)
9058 return Pred;
9059 }
9060 llvm_unreachable("Missing preheader");
9061}
9062
9063void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9064 assert(isValid() && "Requires a valid canonical loop");
9065
9066 Instruction *CmpI = &getCond()->front();
9067 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9068 CmpI->setOperand(1, TripCount);
9069
9070#ifndef NDEBUG
9071 assertOK();
9072#endif
9073}
9074
9075void CanonicalLoopInfo::mapIndVar(
9076 llvm::function_ref<Value *(Instruction *)> Updater) {
9077 assert(isValid() && "Requires a valid canonical loop");
9078
9079 Instruction *OldIV = getIndVar();
9080
9081 // Record all uses excluding those introduced by the updater. Uses by the
9082 // CanonicalLoopInfo itself to keep track of the number of iterations are
9083 // excluded.
9084 SmallVector<Use *> ReplacableUses;
9085 for (Use &U : OldIV->uses()) {
9086 auto *User = dyn_cast<Instruction>(U.getUser());
9087 if (!User)
9088 continue;
9089 if (User->getParent() == getCond())
9090 continue;
9091 if (User->getParent() == getLatch())
9092 continue;
9093 ReplacableUses.push_back(&U);
9094 }
9095
9096 // Run the updater that may introduce new uses
9097 Value *NewIV = Updater(OldIV);
9098
9099 // Replace the old uses with the value returned by the updater.
9100 for (Use *U : ReplacableUses)
9101 U->set(NewIV);
9102
9103#ifndef NDEBUG
9104 assertOK();
9105#endif
9106}
9107
9108void CanonicalLoopInfo::assertOK() const {
9109#ifndef NDEBUG
9110 // No constraints if this object currently does not describe a loop.
9111 if (!isValid())
9112 return;
9113
9114 BasicBlock *Preheader = getPreheader();
9115 BasicBlock *Body = getBody();
9116 BasicBlock *After = getAfter();
9117
9118 // Verify standard control-flow we use for OpenMP loops.
9119 assert(Preheader);
9120 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9121 "Preheader must terminate with unconditional branch");
9122 assert(Preheader->getSingleSuccessor() == Header &&
9123 "Preheader must jump to header");
9124
9125 assert(Header);
9126 assert(isa<BranchInst>(Header->getTerminator()) &&
9127 "Header must terminate with unconditional branch");
9128 assert(Header->getSingleSuccessor() == Cond &&
9129 "Header must jump to exiting block");
9130
9131 assert(Cond);
9132 assert(Cond->getSinglePredecessor() == Header &&
9133 "Exiting block only reachable from header");
9134
9135 assert(isa<BranchInst>(Cond->getTerminator()) &&
9136 "Exiting block must terminate with conditional branch");
9137 assert(size(successors(Cond)) == 2 &&
9138 "Exiting block must have two successors");
9139 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9140 "Exiting block's first successor jump to the body");
9141 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9142 "Exiting block's second successor must exit the loop");
9143
9144 assert(Body);
9145 assert(Body->getSinglePredecessor() == Cond &&
9146 "Body only reachable from exiting block");
9147 assert(!isa<PHINode>(Body->front()));
9148
9149 assert(Latch);
9150 assert(isa<BranchInst>(Latch->getTerminator()) &&
9151 "Latch must terminate with unconditional branch");
9152 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9153 // TODO: To support simple redirecting of the end of the body code that has
9154 // multiple predecessors; introduce another auxiliary basic block like preheader and after.
9155 assert(Latch->getSinglePredecessor() != nullptr);
9156 assert(!isa<PHINode>(Latch->front()));
9157
9158 assert(Exit);
9159 assert(isa<BranchInst>(Exit->getTerminator()) &&
9160 "Exit block must terminate with unconditional branch");
9161 assert(Exit->getSingleSuccessor() == After &&
9162 "Exit block must jump to after block");
9163
9164 assert(After);
9165 assert(After->getSinglePredecessor() == Exit &&
9166 "After block only reachable from exit block");
9167 assert(After->empty() || !isa<PHINode>(After->front()));
9168
9169 Instruction *IndVar = getIndVar();
9170 assert(IndVar && "Canonical induction variable not found?");
9171 assert(isa<IntegerType>(IndVar->getType()) &&
9172 "Induction variable must be an integer");
9173 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9174 "Induction variable must be a PHI in the loop header");
9175 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9176 assert(
9177 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9178 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9179
9180 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9181 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9182 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9183 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9184 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9185 ->isOne());
9186
9187 Value *TripCount = getTripCount();
9188 assert(TripCount && "Loop trip count not found?");
9189 assert(IndVar->getType() == TripCount->getType() &&
9190 "Trip count and induction variable must have the same type");
9191
9192 auto *CmpI = cast<CmpInst>(&Cond->front());
9193 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9194 "Exit condition must be an unsigned less-than comparison");
9195 assert(CmpI->getOperand(0) == IndVar &&
9196 "Exit condition must compare the induction variable");
9197 assert(CmpI->getOperand(1) == TripCount &&
9198 "Exit condition must compare with the trip count");
9199#endif
9200}
9201
9202void CanonicalLoopInfo::invalidate() {
9203 Header = nullptr;
9204 Cond = nullptr;
9205 Latch = nullptr;
9206 Exit = nullptr;
9207}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={})
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable from the function.
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass execution by PassManagers.
This header defines various interfaces for pass management in LLVM.
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:61
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:122
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:97
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:115
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:102
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:126
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:93
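A minimal sketch of creating a stack allocation and using the accessors above; Builder (an IRBuilder), DL (the module's DataLayout), and Ty are assumed to be in scope:

  AllocaInst *AI = Builder.CreateAlloca(Ty, DL.getAllocaAddrSpace(),
                                        /*ArraySize=*/nullptr, "tmp");
  AI->setAlignment(DL.getPrefTypeAlign(AI->getAllocatedType()));
  std::optional<TypeSize> Bytes = AI->getAllocationSize(DL); // size in bytes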
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:467
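A sketch of populating a FunctionAnalysisManager before querying an analysis result, as done when this builder needs LoopInfo or a DominatorTree; note that PassInstrumentationAnalysis must be registered for the manager to work, and F is an assumed llvm::Function:

  FunctionAnalysisManager FAM;
  FAM.registerPass([] { return PassInstrumentationAnalysis(); });
  FAM.registerPass([] { return DominatorTreeAnalysis(); });
  FAM.registerPass([] { return LoopAnalysis(); });
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);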
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:635
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:644
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ FSub
*p = old - v
Definition: Instructions.h:736
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:748
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:752
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
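For example, an atomic fetch-add using these BinOp values can be emitted through IRBuilderBase::CreateAtomicRMW (listed further below); Ptr and Val are assumed i32 values already in scope:

  // *Ptr = *Ptr + Val atomically, returning the old value.
  AtomicRMWInst *Old = Builder.CreateAtomicRMW(
      AtomicRMWInst::Add, Ptr, Val, MaybeAlign(4),
      AtomicOrdering::Monotonic);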
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:577
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:866
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:851
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
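A sketch of the usual pattern for stamping function attributes with these helpers; Fn is an assumed llvm::Function:

  LLVMContext &Ctx = Fn.getContext();
  AttrBuilder B(Ctx);
  B.addAttribute(Attribute::NoUnwind);
  B.addAttribute(Attribute::WillReturn);
  Fn.setAttributes(Fn.getAttributes().addFnAttributes(Ctx, B));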
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
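A sketch of splitting at an assumed instruction iterator SplitPt: everything from SplitPt onward moves to the new block, and BB is left ending in an unconditional branch to it:

  BasicBlock *Tail = BB->splitBasicBlock(SplitPt, "region.split");
  // New code can now be emitted just before BB's new terminator.
  Builder.SetInsertPoint(BB->getTerminator());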
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic, or any pseudo operation if SkipPseudoOp is true.
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives in, right before MovePos.
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if the block is not well formed.
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1465
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1385
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1391
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
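Taken together, these accessors let clients fill in a loop produced by OpenMPIRBuilder::createCanonicalLoop (listed below); a minimal sketch, with CLI an assumed CanonicalLoopInfo*:

  Builder.restoreIP(CLI->getBodyIP());
  Instruction *IV = CLI->getIndVar();   // current logical iteration number
  // ... emit the code for one iteration in terms of IV ...
  CLI->assertOK();                      // self-check (compiled out in release)
  Builder.restoreIP(CLI->getAfterIP()); // continue after the loop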
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:45
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:84
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as a scalar.
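A sketch of the extraction flow these members support; Blocks is an assumed single-entry region of basic blocks in function F:

  CodeExtractorAnalysisCache CEAC(F);
  CodeExtractor Extractor(Blocks);
  if (Extractor.isEligible()) {
    CodeExtractor::ValueSet Inputs, Outputs, Allocas;
    Extractor.findInputsOutputs(Inputs, Outputs, Allocas);
    Function *Outlined = Extractor.extractCodeRegion(CEAC);
  }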
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2950
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type, with an element count and element type matching the ArrayRef passed in.
Definition: Constants.h:706
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2227
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2242
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1800
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:251
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:490
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:233
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:461
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:725
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:377
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:621
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:429
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:842
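A sketch of the common DataLayout queries above; M is an assumed llvm::Module and Ty an llvm::Type:

  const DataLayout &DL = M.getDataLayout();
  TypeSize AllocBytes = DL.getTypeAllocSize(Ty); // includes alignment padding
  TypeSize StoreBytes = DL.getTypeStoreSize(Ty); // max bytes a store may write
  Align PrefAlign = DL.getPrefTypeAlign(Ty);
  unsigned GlobalsAS = DL.getDefaultGlobalsAddressSpace();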
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:653
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:172
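A sketch of declaring a fresh function with these two factories; Ctx and M are assumed, and the name "outlined.body" is purely illustrative:

  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                        {PointerType::get(Ctx, 0)},
                                        /*isVarArg=*/false);
  Function *Fn = Function::Create(FTy, GlobalValue::InternalLinkage,
                                  "outlined.body", &M);
  Fn->addFnAttr(Attribute::NoUnwind);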
const BasicBlock & getEntryBlock() const
Definition: Function.h:807
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:214
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:465
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:384
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:769
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:781
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:357
const Function & getFunction() const
Definition: Function.h:170
iterator begin()
Definition: Function.h:851
arg_iterator arg_begin()
Definition: Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:360
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:681
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:752
size_t arg_size() const
Definition: Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:219
iterator end()
Definition: Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:285
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1528
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:254
BasicBlock * getBlock() const
Definition: IRBuilder.h:269
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:267
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:270
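These members support the save/restore pattern used throughout the builder; a minimal sketch, with SomeOtherBB an assumed block:

  IRBuilder<>::InsertPoint SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(SomeOtherBB); // emit code elsewhere for a while
  // ...
  if (SavedIP.isSet())
    Builder.restoreIP(SavedIP);        // return to the original position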
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects.
Definition: IRBuilder.cpp:1107
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1858
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1790
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2543
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1824
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2059
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1280
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2190
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2536
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1307
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1989
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:578
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2053
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1353
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1891
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2202
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1395
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:518
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1883
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1738
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:274
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:2012
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2386
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2417
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of cases that it will be switched over.
Definition: IRBuilder.h:1160
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2261
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1137
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition: IRBuilder.h:1807
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2041
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1492
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1107
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1930
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1976
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1820
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2569
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1871
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2027
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1514
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1131
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:478
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2216
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:286
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2564
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:561
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1843
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2432
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1473
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1536
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2371
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1421
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:656
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2074
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1378
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2686
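A minimal sketch of driving the API summarized above; BB, LHS, and RHS are assumed to exist:

  IRBuilder<> Builder(BB); // append to the end of BB
  Value *Sum = Builder.CreateAdd(LHS, RHS, "sum");
  Value *IsPos = Builder.CreateICmpSGT(Sum, Builder.getInt32(0), "is.pos");
  Value *Clamped = Builder.CreateSelect(IsPos, Sum, Builder.getInt32(0));
  Builder.CreateRetVoid(); // terminate the block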
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original order.
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:938
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1642
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:120
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1550
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1542
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
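These factories suffice to build the self-referential !llvm.loop node that addLoopMetadata (listed above) attaches to a loop; a sketch, with Ctx assumed:

  Metadata *UnrollProp[] = {MDString::get(Ctx, "llvm.loop.unroll.enable")};
  Metadata *Ops[] = {nullptr, MDNode::get(Ctx, UnrollProp)};
  MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID); // first operand points at itself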
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful, otherwise returning null.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:262
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:299
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:282
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:295
iterator_range< global_iterator > globals()
Definition: Module.h:699
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:611
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:444
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:135
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:269
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:459
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:291
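A sketch of the symbol-table lookups above; the names are illustrative only:

  if (Function *Fn = M.getFunction("__kmpc_barrier"))
    Fn->addFnAttr(Attribute::NoUnwind);
  GlobalVariable *GV = M.getGlobalVariable("my.global"); // null if absent
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");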
A tuple of MDNodes.
Definition: Metadata.h:1730
iterator_range< op_iterator > operands()
Definition: Metadata.h:1826
void addOperand(MDNode *M)
Definition: Metadata.cpp:1394
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:243
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:245
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:376
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:378
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:296
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:298
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:287
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:356
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:362
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:368
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:366
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' entry.
Definition: OMPIRBuilder.h:360
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' entry.
Definition: OMPIRBuilder.h:358
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:432
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (if set to true).
Definition: OMPIRBuilder.h:92
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:188
StringRef separator() const
Definition: OMPIRBuilder.h:174
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:164
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:105
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:147
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:141
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:184
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:473
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
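A sketch of declaring and calling a runtime entry with this helper; Ident and ThreadID are assumed values (see getOrCreateIdent and getOrCreateThreadID below), and OMPRTL___kmpc_barrier is one valid omp::RuntimeFunction ID:

  FunctionCallee Barrier =
      OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier);
  Builder.CreateCall(Barrier, {Ident, ThreadID});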
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:519
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for an OpenMP 'if' clause using the specified BodyGenCallbackTy. The logic: if (Cond) { ThenGen } else { ElseGen }.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from the given callback.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic capture for constructs such as V = X; X = X BinOp Expr and X = X BinOp Expr; V = X. Only scalar data types.
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the underlying module.
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = UpdateOp(X).
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
InsertPointTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for '#omp task'.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types), e.g. cond-expr-stmt: x = x ordop expr ? expr : x.
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate a conditional branch and the relevant BasicBlocks through which private threads copy the 'copyin' variables from the master copy to their own threadprivate copies.
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack, made up of finalize callbacks currently in-flight, wrapped into FinalizationInfo objects that also reference the finalization target block and the kind of cancellable directive.
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder<>::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
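A minimal sketch of driving this generator, assuming the callback shapes declared in OMPIRBuilder.h at this revision (void body and finalization callbacks, an InsertPointTy-returning privatization callback) and pre-existing Loc and AllocaIP values.
// Hedged sketch: unconditional '#pragma omp parallel', no privatization.
auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
                     OpenMPIRBuilder::InsertPointTy CodeGenIP) {
  // Emit the parallel region body at CodeGenIP.
};
auto PrivCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
                  OpenMPIRBuilder::InsertPointTy CodeGenIP, Value &Orig,
                  Value &Inner, Value *&ReplacementValue) {
  ReplacementValue = &Inner; // Reuse the captured value as-is.
  return CodeGenIP;
};
auto FiniCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP) {};
OpenMPIRBuilder::InsertPointTy AfterIP = OMPBuilder.createParallel(
    Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
    /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
    /*IsCancellable=*/false);
OMPBuilder.Builder.restoreIP(AfterIP);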
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
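The same callback pattern drives the simpler region generators; a sketch for a nowait single region follows, with names assumed as in the parallel example.
// Hedged sketch: '#pragma omp single nowait' with an empty finalizer.
auto SingleBodyCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
                        OpenMPIRBuilder::InsertPointTy CodeGenIP) {
  // Emit the single region body at CodeGenIP.
};
auto SingleFiniCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP) {};
OpenMPIRBuilder::InsertPointTy AfterSingle =
    OMPBuilder.createSingle(Loc, SingleBodyCB, SingleFiniCB,
                            /*IsNowait=*/true);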
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:499
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Callback used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
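These helpers compose into the usual preamble for emitting runtime calls. The sketch below also assumes getOrCreateIdent, which is declared in the same header though not listed in this index.
// Hedged sketch: default source location string -> ident_t* -> thread ID.
uint32_t SrcLocStrSize;
Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident);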
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates an offloading entry for the provided entry ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
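A sketch of the call, assuming CLI is a CanonicalLoopInfo* and that OMP_ORDER_unknown is the "no order clause" value in omp::OrderKind.
// Hedged sketch: annotate CLI for vectorization with simdlen(8).
MapVector<Value *, Value *> AlignedVars; // no aligned() clause
OMPBuilder.applySimd(CLI, AlignedVars, /*IfCond=*/nullptr,
                     omp::OrderKind::OMP_ORDER_unknown,
                     /*Simdlen=*/OMPBuilder.Builder.getInt32(8),
                     /*Safelen=*/nullptr);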
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
InsertPointTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={})
Generator for '#omp target'.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder<> Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
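A one-line sketch; the flag name is illustrative (clang's GPU runtime uses this entry point for flags such as __omp_rtl_debug_kind).
// Hedged sketch: record a mode bit as a hidden global in the module.
GlobalValue *Flag = OMPBuilder.createGlobalFlag(/*Value=*/1,
                                                "__omp_rtl_debug_kind");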
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
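A sketch with explicit clause values, using the same body-callback shape as the parallel example above.
// Hedged sketch: '#pragma omp teams num_teams(4) thread_limit(64)'.
auto TeamsBodyCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
                       OpenMPIRBuilder::InsertPointTy CodeGenIP) {
  // Emit the teams region body at CodeGenIP.
};
OpenMPIRBuilder::InsertPointTy AfterTeams = OMPBuilder.createTeams(
    Loc, TeamsBodyCB, /*NumTeamsLower=*/nullptr,
    /*NumTeamsUpper=*/OMPBuilder.Builder.getInt32(4),
    /*ThreadLimit=*/OMPBuilder.Builder.getInt32(64), /*IfExpr=*/nullptr);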
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas we attempt to raise on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, SmallVector< OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:460
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
iterator begin() const
Definition: SmallPtrSet.h:455
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
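A tiny usage sketch of the insert contract described above.
// Hedged sketch: SmallSet deduplication; insert().second reports newness.
llvm::SmallSet<int, 4> Seen;
bool WasNew = Seen.insert(42).second;  // true: first insertion
WasNew = Seen.insert(42).second;       // false: already present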
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:290
void setAlignment(Align Align)
Definition: Instructions.h:333
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:360
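A sketch of the store-instruction setters above; Builder, Val, and Ptr are assumed to be in scope.
// Hedged sketch: emit a store, then give it alignment and release ordering.
StoreInst *SI = Builder.CreateStore(Val, Ptr);
SI->setAlignment(Align(4));
SI->setAtomic(AtomicOrdering::Release);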
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:685
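A sketch of the split behavior documented above.
// Hedged sketch: split "key=value" at the first '='.
llvm::StringRef KV = "tile=32";
auto [Key, Val] = KV.split('=');
// Key == "tile", Val == "32"; with no separator, Val would be empty.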
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:436
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:601
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:501
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:953
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1011
bool isWasm() const
Tests whether the target is wasm (32- or 64-bit).
Definition: Triple.h:1021
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:127
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:143
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:827
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:70
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID used when the device was not defined; the runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
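A sketch of the splitBB family in its IRBuilder-driven form; Builder is assumed to be positioned inside the block being split.
// Hedged sketch: split the current block at the builder's insert point.
// Everything after the insert point moves to the returned block, and the
// old block falls through to it via an explicit branch.
BasicBlock *Cont = splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".cont");
// Emit code ahead of the moved tail at the head of the continuation block:
Builder.SetInsertPoint(Cont, Cont->begin());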
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:853
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2431
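A sketch of index-carrying iteration with the helper above.
// Hedged sketch: enumerate pairs each element with its 0-based index.
llvm::SmallVector<llvm::StringRef, 4> Names = {"alpha", "beta", "gamma"};
for (auto [Idx, Name] : llvm::enumerate(Names))
  llvm::errs() << Idx << ": " << Name << "\n";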
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2098
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:872
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
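A sketch of the arithmetic helpers above, as typically used when computing chunk counts.
// Hedged sketch: 100 iterations in chunks of 8 need ceil(100/8) = 13 chunks.
unsigned NumChunks = llvm::divideCeil(100u, 8u); // 13
unsigned ShiftAmt = llvm::Log2_32(64);           // 6: 64 == 1 << 6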
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:615
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:202
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61