//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause them to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
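
// Illustrative sketch (editor's addition, not part of the original source):
// two builders saved at the same point are "ambiguous" in this sense:
//   IRBuilder<> B1(BB, BB->begin()), B2(BB, BB->begin());
//   // isConflictIP(B1.saveIP(), B2.saveIP()) is true: alternating
//   // B1.Create* and B2.Create* calls would interleave the instructions.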

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Unexpected schedule type");
  return Result;
}
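
// Worked example (editor's addition, not part of the original source): for
// '#pragma omp for schedule(nonmonotonic: dynamic)' without an ordered
// clause,
//   computeOpenMPScheduleType(OMP_SCHEDULE_Dynamic, /*HasChunks=*/false,
//                             /*HasSimdModifier=*/false,
//                             /*HasMonotonicModifier=*/false,
//                             /*HasNonmonotonicModifier=*/true,
//                             /*HasOrderedClause=*/false)
// yields BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic,
// i.e. UnorderedDynamicChunked with the nonmonotonic bit set.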

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
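
// Usage sketch (editor's addition, not part of the original source): these
// helpers carve the current block in two while keeping the IRBuilder usable:
//   Builder.SetInsertPoint(SomeInst);
//   BasicBlock *Cont =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".split");
//   // SomeInst and everything after it now live in Cont; the Builder is
//   // positioned before the new unconditional branch in the old block.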

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilder<> &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               std::stack<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push(UseFakeVal);
  return FakeVal;
}
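
// Usage sketch (editor's addition, not part of the original source; the
// insertion points are hypothetical): callers thread a deletion stack through
// this helper and erase the modeling instructions once outlining is done:
//   std::stack<Instruction *> ToBeDeleted;
//   Value *TID = createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted,
//                                 InnerAllocaIP, "gtid", /*AsPtr=*/false);
//   // ... outline the region so TID becomes a real argument ...
//   while (!ToBeDeleted.empty()) {
//     ToBeDeleted.top()->eraseFromParent();
//     ToBeDeleted.pop();
//   }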

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
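
// Usage sketch (editor's addition, not part of the original source): a
// frontend typically configures the builder once, mirroring any
// '#pragma omp requires' clauses it has seen:
//   OpenMPIRBuilder OMPBuilder(M);
//   OMPBuilder.setConfig(OpenMPIRBuilderConfig(
//       /*IsTargetDevice=*/false, /*IsGPU=*/false,
//       /*OpenMPOffloadMandatory=*/false,
//       /*HasRequiresReverseOffload=*/false,
//       /*HasRequiresUnifiedAddress=*/false,
//       /*HasRequiresUnifiedSharedMemory=*/true,
//       /*HasRequiresDynamicAllocators=*/false));
//   OMPBuilder.initialize();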

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
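
// Editor's note (not part of the original source): ArgsVector mirrors the
// kernel-arguments record consumed by __tgt_target_kernel; NumTeams and
// NumThreads are widened to three-element arrays because the runtime accepts
// up to three grid dimensions, with only dimension 0 populated here.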

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
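
// Editor's note with an illustrative IR sketch (not part of the original
// source): the !callback annotation created above tells inter-procedural
// analyses that the microtask argument is invoked with the forwarded
// variadic arguments:
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}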

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry
  // block of our target function or we risk malformed optimisations by later
  // passes. This is only relevant for the device pass, which appears to be a
  // little more delicate when it comes to optimisations (however, we do not
  // block on that here; it is up to whoever inserts into the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly
  // adversely affected by any raises unless intentionally appended to the
  // list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // by the time they get here. Extending it to runtime-defined or
  // read+writeable allocation sizes would be non-trivial (we would need to
  // factor in movement of any stores to variables the allocation size depends
  // on, as well as the usual loads, otherwise it will yield the wrong result
  // after movement) and would likely be more suitable as an LLVM optimisation
  // pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
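
// Illustrative IR sketch (editor's addition, not part of the original
// source): a default ident_t emitted through this path typically looks like
//   @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
//   @1 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
// where i32 2 is OMP_IDENT_FLAG_KMPC and i32 22 is the source-location
// string size.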

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
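
// Example (editor's addition, not part of the original source): for function
// foo at line 3, column 10 of file.c this produces the libomp location
// string ";file.c;foo;3;10;;".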

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;
  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc,
                                 Directive Kind, bool ForceSimpleCall,
                                 bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
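
// Illustrative IR sketch (editor's addition, not part of the original
// source): for a plain, non-cancellable barrier this lowers to
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident.barrier, i32 %gtid)
// while in a cancellable region __kmpc_cancel_barrier is used instead and
// its i32 result feeds emitCancelationCheckImpl.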

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
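
// Illustrative IR sketch (editor's addition, not part of the original
// source): the launch-or-fallback pattern emitted above looks roughly like
//   %rc = call i32 @__tgt_target_kernel(ptr @loc, i64 %device_id, i32 %nt,
//                                       i32 %nth, ptr @region.id, ptr %args)
//   %failed = icmp ne i32 %rc, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
// with the host fallback emitted into %omp_offload.failed.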

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// by a call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
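
// Illustrative IR sketch (editor's addition, not part of the original
// source): after this rewrite the device-side parallel region is entered via
//   call void @__kmpc_parallel_51(ptr @ident, i32 %gtid, i32 1, i32 -1,
//                                 i32 -1, ptr @outlined..omp_par, ptr null,
//                                 ptr %args, i64 %nargs)
// instead of a direct call to the outlined function.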

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// by a call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
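
// Illustrative IR sketch (editor's addition, not part of the original
// source): on the host a two-capture parallel region ends up as
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)
// and the runtime invokes the microtask with the two thread-id pointers
// followed by %a and %b on each thread of the team.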

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but
  // which we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  Builder.restoreIP(OuterAllocaIP);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After  body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");

  FunctionCallee TIDRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);

  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return;
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto stack and load it back inside the to-be-outlined region. This
    // will ensure only the pointer will be passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      IRBuilder<>::InsertPointGuard Guard(Builder);
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to stack at end of the block that currently branches to the
      // entry block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to allocations in the to-be-outlined region.
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      Builder.restoreIP(
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return;
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  };
1595
1596 // Reset the inner alloca insertion as it will be used for loading the values
1597 // wrapped into pointers before passing them into the to-be-outlined region.
1598 // Configure it to insert immediately after the fake use of zero address so
1599 // that they are available in the generated body and so that the
1600 // OpenMP-related values (thread ID and zero address pointers) remain leading
1601 // in the argument list.
1602 InnerAllocaIP = IRBuilder<>::InsertPoint(
1603 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1604
1605 // Reset the outer alloca insertion point to the entry of the relevant block
1606 // in case it was invalidated.
1607 OuterAllocaIP = IRBuilder<>::InsertPoint(
1608 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1609
1610 for (Value *Input : Inputs) {
1611 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1612 PrivHelper(*Input);
1613 }
1614 LLVM_DEBUG({
1615 for (Value *Output : Outputs)
1616 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1617 });
1618 assert(Outputs.empty() &&
1619 "OpenMP outlining should not produce live-out values!");
1620
1621 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1622 LLVM_DEBUG({
1623 for (auto *BB : Blocks)
1624 dbgs() << " PBR: " << BB->getName() << "\n";
1625 });
1626
1627 // Adjust the finalization stack, verify the adjustment, and call the
1628 // finalize function one last time to finalize values between the pre-fini
1629 // block and the exit block if we left the parallel region "the normal way".
1630 auto FiniInfo = FinalizationStack.pop_back_val();
1631 (void)FiniInfo;
1632 assert(FiniInfo.DK == OMPD_parallel &&
1633 "Unexpected finalization stack state!");
1634
1635 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1636
1637 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1638 FiniCB(PreFiniIP);
1639
1640 // Register the outlined info.
1641 addOutlineInfo(std::move(OI));
1642
1643 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1644 UI->eraseFromParent();
1645
1646 return AfterIP;
1647}
1648
1649 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1650 // Build call void __kmpc_flush(ident_t *loc)
1651 uint32_t SrcLocStrSize;
1652 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1653 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1654
1655 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1656}
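// The emitted IR is a single runtime call, roughly (names illustrative):
//   call void @__kmpc_flush(ptr @loc)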
1657
1658 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1659 if (!updateToLocation(Loc))
1660 return;
1661 emitFlush(Loc);
1662}
1663
1664 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1665 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1666 // global_tid);
1667 uint32_t SrcLocStrSize;
1668 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1669 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1670 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1671
1672 // Ignore return result until untied tasks are supported.
1673 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1674 Args);
1675}
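// The emitted IR is roughly (names illustrative):
//   %res = call i32 @__kmpc_omp_taskwait(ptr @loc, i32 %gtid)
// with %res currently unused, as noted above.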
1676
1677 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1678 if (!updateToLocation(Loc))
1679 return;
1680 emitTaskwaitImpl(Loc);
1681}
1682
1683 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1684 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1685 uint32_t SrcLocStrSize;
1686 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1687 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1688 Constant *I32Null = ConstantInt::getNullValue(Int32);
1689 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1690
1691 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1692 Args);
1693}
1694
1695 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1696 if (!updateToLocation(Loc))
1697 return;
1698 emitTaskyieldImpl(Loc);
1699}
1700
1701 OpenMPIRBuilder::InsertPointTy
1702 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1703 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1704 bool Tied, Value *Final, Value *IfCondition,
1705 SmallVector<DependData> Dependencies) {
1706
1707 if (!updateToLocation(Loc))
1708 return InsertPointTy();
1709
1710 uint32_t SrcLocStrSize;
1711 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1712 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1713 // The current basic block is split into four basic blocks. After outlining,
1714 // they will be mapped as follows:
1715 // ```
1716 // def current_fn() {
1717 // current_basic_block:
1718 // br label %task.exit
1719 // task.exit:
1720 // ; instructions after task
1721 // }
1722 // def outlined_fn() {
1723 // task.alloca:
1724 // br label %task.body
1725 // task.body:
1726 // ret void
1727 // }
1728 // ```
1729 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1730 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1731 BasicBlock *TaskAllocaBB =
1732 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1733
1734 InsertPointTy TaskAllocaIP =
1735 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1736 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1737 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1738
1739 OutlineInfo OI;
1740 OI.EntryBB = TaskAllocaBB;
1741 OI.OuterAllocaBB = AllocaIP.getBlock();
1742 OI.ExitBB = TaskExitBB;
1743
1744 // Add the thread ID argument.
1745 std::stack<Instruction *> ToBeDeleted;
1746 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1747 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1748
1749 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1750 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1751 // Replace the stale CI with an appropriate RTL function call.
1752 assert(OutlinedFn.getNumUses() == 1 &&
1753 "there must be a single user for the outlined function");
1754 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1755
1756 // HasShareds is true if any variables are captured in the outlined region,
1757 // false otherwise.
1758 bool HasShareds = StaleCI->arg_size() > 1;
1759 Builder.SetInsertPoint(StaleCI);
1760
1761 // Gather the arguments for emitting the runtime call for
1762 // @__kmpc_omp_task_alloc
1763 Function *TaskAllocFn =
1764 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1765
1766 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
1767 // call.
1768 Value *ThreadID = getOrCreateThreadID(Ident);
1769
1770 // Argument - `flags`
1771 // Task is tied iff (Flags & 1) == 1.
1772 // Task is untied iff (Flags & 1) == 0.
1773 // Task is final iff (Flags & 2) == 2.
1774 // Task is not final iff (Flags & 2) == 0.
1775 // TODO: Handle the other flags.
1776 Value *Flags = Builder.getInt32(Tied);
1777 if (Final) {
1778 Value *FinalFlag =
1779 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1780 Flags = Builder.CreateOr(FinalFlag, Flags);
1781 }
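// E.g. for a tied task with a `final(%c)` clause this computes roughly
// (names illustrative):
//   %final.flag = select i1 %c, i32 2, i32 0
//   %flags = or i32 %final.flag, 1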
1782
1783 // Argument - `sizeof_kmp_task_t` (TaskSize)
1784 // Tasksize refers to the size in bytes of kmp_task_t data structure
1785 // including private vars accessed in task.
1786 // TODO: add kmp_task_t_with_privates (privates)
1787 Value *TaskSize = Builder.getInt64(
1788 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1789
1790 // Argument - `sizeof_shareds` (SharedsSize)
1791 // SharedsSize refers to the shareds array size in the kmp_task_t data
1792 // structure.
1793 Value *SharedsSize = Builder.getInt64(0);
1794 if (HasShareds) {
1795 AllocaInst *ArgStructAlloca =
1796 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1797 assert(ArgStructAlloca &&
1798 "Unable to find the alloca instruction corresponding to arguments "
1799 "for extracted function");
1800 StructType *ArgStructType =
1801 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1802 assert(ArgStructType && "Unable to find struct type corresponding to "
1803 "arguments for extracted function");
1804 SharedsSize =
1805 M.getDataLayout().getTypeStoreSize(ArgStructType);
1806 }
1807 // Emit the @__kmpc_omp_task_alloc runtime call
1808 // The runtime call returns a pointer to an area where the task captured
1809 // variables must be copied before the task is run (TaskData)
1810 CallInst *TaskData = Builder.CreateCall(
1811 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1812 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1813 /*task_func=*/&OutlinedFn});
1814
1815 // Copy the arguments for outlined function
1816 if (HasShareds) {
1817 Value *Shareds = StaleCI->getArgOperand(1);
1818 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1819 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1820 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1821 SharedsSize);
1822 }
1823
1824 Value *DepArray = nullptr;
1825 if (Dependencies.size()) {
1826 InsertPointTy OldIP = Builder.saveIP();
1827 Builder.SetInsertPoint(
1828 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1829
1830 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1831 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1832
1833 unsigned P = 0;
1834 for (const DependData &Dep : Dependencies) {
1835 Value *Base =
1836 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1837 // Store the pointer to the variable
1838 Value *Addr = Builder.CreateStructGEP(
1839 DependInfo, Base,
1840 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1841 Value *DepValPtr =
1842 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1843 Builder.CreateStore(DepValPtr, Addr);
1844 // Store the size of the variable
1845 Value *Size = Builder.CreateStructGEP(
1846 DependInfo, Base,
1847 static_cast<unsigned int>(RTLDependInfoFields::Len));
1848 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1849 Dep.DepValueType)),
1850 Size);
1851 // Store the dependency kind
1852 Value *Flags = Builder.CreateStructGEP(
1853 DependInfo, Base,
1854 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1855 Builder.CreateStore(
1856 ConstantInt::get(Builder.getInt8Ty(),
1857 static_cast<unsigned int>(Dep.DepKind)),
1858 Flags);
1859 ++P;
1860 }
1861
1862 Builder.restoreIP(OldIP);
1863 }
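// E.g. for `depend(in: x)` with a 4-byte `x`, the entry written above holds
// roughly { ptrtoint(&x), 4, <flags encoding Dep.DepKind> } in the field
// order BaseAddr, Len, Flags (values illustrative).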
1864
1865 // In the presence of the `if` clause, the following IR is generated:
1866 // ...
1867 // %data = call @__kmpc_omp_task_alloc(...)
1868 // br i1 %if_condition, label %then, label %else
1869 // then:
1870 // call @__kmpc_omp_task(...)
1871 // br label %exit
1872 // else:
1873 // ;; Wait for resolution of dependencies, if any, before
1874 // ;; beginning the task
1875 // call @__kmpc_omp_wait_deps(...)
1876 // call @__kmpc_omp_task_begin_if0(...)
1877 // call @outlined_fn(...)
1878 // call @__kmpc_omp_task_complete_if0(...)
1879 // br label %exit
1880 // exit:
1881 // ...
1882 if (IfCondition) {
1883 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1884 // terminator.
1885 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1886 Instruction *IfTerminator =
1887 Builder.GetInsertPoint()->getParent()->getTerminator();
1888 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1889 Builder.SetInsertPoint(IfTerminator);
1890 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1891 &ElseTI);
1892 Builder.SetInsertPoint(ElseTI);
1893
1894 if (Dependencies.size()) {
1895 Function *TaskWaitFn =
1896 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1897 Builder.CreateCall(
1898 TaskWaitFn,
1899 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1900 ConstantInt::get(Builder.getInt32Ty(), 0),
1901 ConstantInt::getNullValue(Builder.getPtrTy())});
1902 }
1903 Function *TaskBeginFn =
1904 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1905 Function *TaskCompleteFn =
1906 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1907 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1908 CallInst *CI = nullptr;
1909 if (HasShareds)
1910 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1911 else
1912 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1913 CI->setDebugLoc(StaleCI->getDebugLoc());
1914 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1915 Builder.SetInsertPoint(ThenTI);
1916 }
1917
1918 if (Dependencies.size()) {
1919 Function *TaskFn =
1920 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1921 Builder.CreateCall(
1922 TaskFn,
1923 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
1924 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
1925 ConstantInt::getNullValue(Builder.getPtrTy())});
1926
1927 } else {
1928 // Emit the @__kmpc_omp_task runtime call to spawn the task
1929 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1930 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
1931 }
1932
1933 StaleCI->eraseFromParent();
1934
1935 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
1936 if (HasShareds) {
1937 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
1938 OutlinedFn.getArg(1)->replaceUsesWithIf(
1939 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
1940 }
1941
1942 while (!ToBeDeleted.empty()) {
1943 ToBeDeleted.top()->eraseFromParent();
1944 ToBeDeleted.pop();
1945 }
1946 };
1947
1948 addOutlineInfo(std::move(OI));
1949 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
1950
1951 return Builder.saveIP();
1952}
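// Typical usage from a frontend, as a sketch (caller-side names and the body
// callback are illustrative):
//   Builder.restoreIP(OMPBuilder.createTask(
//       Loc, AllocaIP,
//       [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//         // emit the task body at CodeGenIP
//       },
//       /*Tied=*/true, /*Final=*/nullptr, /*IfCondition=*/nullptr,
//       /*Dependencies=*/{}));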
1953
1954 OpenMPIRBuilder::InsertPointTy
1955 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
1956 InsertPointTy AllocaIP,
1957 BodyGenCallbackTy BodyGenCB) {
1958 if (!updateToLocation(Loc))
1959 return InsertPointTy();
1960
1961 uint32_t SrcLocStrSize;
1962 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1963 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1964 Value *ThreadID = getOrCreateThreadID(Ident);
1965
1966 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
1967 Function *TaskgroupFn =
1968 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
1969 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
1970
1971 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
1972 BodyGenCB(AllocaIP, Builder.saveIP());
1973
1974 Builder.SetInsertPoint(TaskgroupExitBB);
1975 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
1976 Function *EndTaskgroupFn =
1977 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
1978 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
1979
1980 return Builder.saveIP();
1981}
1982
1983 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
1984 const LocationDescription &Loc, InsertPointTy AllocaIP,
1985 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
1986 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1987 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1988
1989 if (!updateToLocation(Loc))
1990 return Loc.IP;
1991
1992 auto FiniCBWrapper = [&](InsertPointTy IP) {
1993 if (IP.getBlock()->end() != IP.getPoint())
1994 return FiniCB(IP);
1995 // This must be done otherwise any nested constructs using FinalizeOMPRegion
1996 // will fail because that function requires the Finalization Basic Block to
1997 // have a terminator, which is already removed by EmitOMPRegionBody.
1998 // IP is currently at cancelation block.
1999 // We need to backtrack to the condition block to fetch
2000 // the exit block and create a branch from cancelation
2001 // to exit block.
2003 Builder.restoreIP(IP);
2004 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2005 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2006 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2007 Instruction *I = Builder.CreateBr(ExitBB);
2008 IP = InsertPointTy(I->getParent(), I->getIterator());
2009 return FiniCB(IP);
2010 };
2011
2012 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2013
2014 // Each section is emitted as a switch case
2015 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2016 // -> OMP.createSection() which generates the IR for each section
2017 // Iterate through all sections and emit a switch construct:
2018 // switch (IV) {
2019 // case 0:
2020 // <SectionStmt[0]>;
2021 // break;
2022 // ...
2023 // case <NumSection> - 1:
2024 // <SectionStmt[<NumSection> - 1]>;
2025 // break;
2026 // }
2027 // ...
2028 // section_loop.after:
2029 // <FiniCB>;
2030 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2031 Builder.restoreIP(CodeGenIP);
2032 BasicBlock *Continue =
2033 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2034 Function *CurFn = Continue->getParent();
2035 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2036
2037 unsigned CaseNumber = 0;
2038 for (auto SectionCB : SectionCBs) {
2039 BasicBlock *CaseBB = BasicBlock::Create(
2040 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2041 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2042 Builder.SetInsertPoint(CaseBB);
2043 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2044 SectionCB(InsertPointTy(),
2045 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2046 CaseNumber++;
2047 }
2048 // remove the existing terminator from body BB since there can be no
2049 // terminators after switch/case
2050 };
2051 // Loop body ends here
2052 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2053 Type *I32Ty = Type::getInt32Ty(M.getContext());
2054 Value *LB = ConstantInt::get(I32Ty, 0);
2055 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2056 Value *ST = ConstantInt::get(I32Ty, 1);
2057 CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2058 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2059 InsertPointTy AfterIP =
2060 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2061
2062 // Apply the finalization callback in LoopAfterBB
2063 auto FiniInfo = FinalizationStack.pop_back_val();
2064 assert(FiniInfo.DK == OMPD_sections &&
2065 "Unexpected finalization stack state!");
2066 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2067 Builder.restoreIP(AfterIP);
2068 BasicBlock *FiniBB =
2069 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2070 CB(Builder.saveIP());
2071 AfterIP = {FiniBB, FiniBB->begin()};
2072 }
2073
2074 return AfterIP;
2075}
2076
2077 OpenMPIRBuilder::InsertPointTy
2078 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2079 BodyGenCallbackTy BodyGenCB,
2080 FinalizeCallbackTy FiniCB) {
2081 if (!updateToLocation(Loc))
2082 return Loc.IP;
2083
2084 auto FiniCBWrapper = [&](InsertPointTy IP) {
2085 if (IP.getBlock()->end() != IP.getPoint())
2086 return FiniCB(IP);
2087 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2088 // will fail because that function requires the Finalization Basic Block to
2089 // have a terminator, which is already removed by EmitOMPRegionBody.
2090 // IP is currently at cancelation block.
2091 // We need to backtrack to the condition block to fetch
2092 // the exit block and create a branch from cancelation
2093 // to exit block.
2095 Builder.restoreIP(IP);
2096 auto *CaseBB = Loc.IP.getBlock();
2097 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2098 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2099 Instruction *I = Builder.CreateBr(ExitBB);
2100 IP = InsertPointTy(I->getParent(), I->getIterator());
2101 return FiniCB(IP);
2102 };
2103
2104 Directive OMPD = Directive::OMPD_sections;
2105 // Since we are using Finalization Callback here, HasFinalize
2106 // and IsCancellable have to be true
2107 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2108 /*Conditional*/ false, /*hasFinalize*/ true,
2109 /*IsCancellable*/ true);
2110}
2111
2112 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
2113 /// the given module and return it.
2114 static Function *getFreshReductionFunc(Module &M) {
2115 Type *VoidTy = Type::getVoidTy(M.getContext());
2116 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
2117 auto *FuncTy =
2118 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
2119 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2120 M.getDataLayout().getDefaultGlobalsAddressSpace(),
2121 ".omp.reduction.func", &M);
2122}
2123
2124 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
2125 const LocationDescription &Loc, InsertPointTy AllocaIP,
2126 ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
2127 for (const ReductionInfo &RI : ReductionInfos) {
2128 (void)RI;
2129 assert(RI.Variable && "expected non-null variable");
2130 assert(RI.PrivateVariable && "expected non-null private variable");
2131 assert(RI.ReductionGen && "expected non-null reduction generator callback");
2132 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
2133 "expected variables and their private equivalents to have the same "
2134 "type");
2135 assert(RI.Variable->getType()->isPointerTy() &&
2136 "expected variables to be pointers");
2137 }
2138
2139 if (!updateToLocation(Loc))
2140 return InsertPointTy();
2141
2142 BasicBlock *InsertBlock = Loc.IP.getBlock();
2143 BasicBlock *ContinuationBlock =
2144 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
2145 InsertBlock->getTerminator()->eraseFromParent();
2146
2147 // Create and populate array of type-erased pointers to private reduction
2148 // values.
2149 unsigned NumReductions = ReductionInfos.size();
2150 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
2151 Builder.restoreIP(AllocaIP);
2152 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
2153
2154 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
2155
2156 for (auto En : enumerate(ReductionInfos)) {
2157 unsigned Index = En.index();
2158 const ReductionInfo &RI = En.value();
2159 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
2160 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
2161 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
2162 }
2163
2164 // Emit a call to the runtime function that orchestrates the reduction.
2165 // Declare the reduction function in the process.
2166 Function *Func = Builder.GetInsertBlock()->getParent();
2167 Module *Module = Func->getParent();
2168 uint32_t SrcLocStrSize;
2169 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2170 bool CanGenerateAtomic =
2171 llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
2172 return RI.AtomicReductionGen;
2173 });
2174 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
2175 CanGenerateAtomic
2176 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
2177 : IdentFlag(0));
2178 Value *ThreadId = getOrCreateThreadID(Ident);
2179 Constant *NumVariables = Builder.getInt32(NumReductions);
2180 const DataLayout &DL = Module->getDataLayout();
2181 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
2182 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
2183 Function *ReductionFunc = getFreshReductionFunc(*Module);
2184 Value *Lock = getOMPCriticalRegionLock(".reduction");
2185 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
2186 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
2187 : RuntimeFunction::OMPRTL___kmpc_reduce);
2188 CallInst *ReduceCall =
2189 Builder.CreateCall(ReduceFunc,
2190 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
2191 ReductionFunc, Lock},
2192 "reduce");
2193
2194 // Create final reduction entry blocks for the atomic and non-atomic case.
2195 // Emit IR that dispatches control flow to one of the blocks based on the
2196 // reduction supporting the atomic mode.
2197 BasicBlock *NonAtomicRedBlock =
2198 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
2199 BasicBlock *AtomicRedBlock =
2200 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
2201 SwitchInst *Switch =
2202 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
2203 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
2204 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
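// The runtime's contract, which this switch mirrors: __kmpc_reduce(_nowait)
// returns 1 when this thread should perform the non-atomic reduction, 2 when
// the atomic path should be used, and 0 when there is nothing left to do, in
// which case control falls through to the continuation block.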
2205
2206 // Populate the non-atomic reduction using the elementwise reduction function.
2207 // This loads the elements from the global and private variables and reduces
2208 // them before storing back the result to the global variable.
2209 Builder.SetInsertPoint(NonAtomicRedBlock);
2210 for (auto En : enumerate(ReductionInfos)) {
2211 const ReductionInfo &RI = En.value();
2212 Type *ValueType = RI.ElementType;
2213 // We have one less load for by-ref case because that load is now inside of
2214 // the reduction region
2215 Value *RedValue = nullptr;
2216 if (!IsByRef) {
2217 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
2218 "red.value." + Twine(En.index()));
2219 }
2220 Value *PrivateRedValue =
2222 "red.private.value." + Twine(En.index()));
2223 Value *Reduced;
2224 if (IsByRef) {
2225 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
2226 PrivateRedValue, Reduced));
2227 } else {
2228 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
2229 PrivateRedValue, Reduced));
2230 }
2231 if (!Builder.GetInsertBlock())
2232 return InsertPointTy();
2233 // for by-ref case, the load is inside of the reduction region
2234 if (!IsByRef)
2235 Builder.CreateStore(Reduced, RI.Variable);
2236 }
2237 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
2238 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
2239 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
2240 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
2241 Builder.CreateBr(ContinuationBlock);
2242
2243 // Populate the atomic reduction using the atomic elementwise reduction
2244 // function. There are no loads/stores here because they will be happening
2245 // inside the atomic elementwise reduction.
2246 Builder.SetInsertPoint(AtomicRedBlock);
2247 if (CanGenerateAtomic && !IsByRef) {
2248 for (const ReductionInfo &RI : ReductionInfos) {
2249 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
2250 RI.Variable, RI.PrivateVariable));
2251 if (!Builder.GetInsertBlock())
2252 return InsertPointTy();
2253 }
2254 Builder.CreateBr(ContinuationBlock);
2255 } else {
2256 Builder.CreateUnreachable();
2257 }
2258
2259 // Populate the outlined reduction function using the elementwise reduction
2260 // function. Partial values are extracted from the type-erased array of
2261 // pointers to private variables.
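// E.g. for a single i32 add-reduction the generated function body is roughly
// (names illustrative; the add itself is emitted by the ReductionGen
// callback):
//   define internal void @.omp.reduction.func(ptr %lhs.arr, ptr %rhs.arr) {
//     %lhs.ptr = load ptr, ptr %lhs.arr
//     %rhs.ptr = load ptr, ptr %rhs.arr
//     %lhs = load i32, ptr %lhs.ptr
//     %rhs = load i32, ptr %rhs.ptr
//     %red = add i32 %lhs, %rhs
//     store i32 %red, ptr %lhs.ptr
//     ret void
//   }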
2262 BasicBlock *ReductionFuncBlock =
2263 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
2264 Builder.SetInsertPoint(ReductionFuncBlock);
2265 Value *LHSArrayPtr = ReductionFunc->getArg(0);
2266 Value *RHSArrayPtr = ReductionFunc->getArg(1);
2267
2268 for (auto En : enumerate(ReductionInfos)) {
2269 const ReductionInfo &RI = En.value();
2270 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2271 RedArrayTy, LHSArrayPtr, 0, En.index());
2272 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
2273 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
2274 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
2275 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2276 RedArrayTy, RHSArrayPtr, 0, En.index());
2277 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
2278 Value *RHSPtr =
2279 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
2280 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
2281 Value *Reduced;
2282 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
2283 if (!Builder.GetInsertBlock())
2284 return InsertPointTy();
2285 // store is inside of the reduction region when using by-ref
2286 if (!IsByRef)
2287 Builder.CreateStore(Reduced, LHSPtr);
2288 }
2289 Builder.CreateRetVoid();
2290
2291 Builder.SetInsertPoint(ContinuationBlock);
2292 return Builder.saveIP();
2293}
2294
2295 OpenMPIRBuilder::InsertPointTy
2296 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
2297 BodyGenCallbackTy BodyGenCB,
2298 FinalizeCallbackTy FiniCB) {
2299
2300 if (!updateToLocation(Loc))
2301 return Loc.IP;
2302
2303 Directive OMPD = Directive::OMPD_master;
2304 uint32_t SrcLocStrSize;
2305 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2306 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2307 Value *ThreadId = getOrCreateThreadID(Ident);
2308 Value *Args[] = {Ident, ThreadId};
2309
2310 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
2311 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2312
2313 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
2314 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
2315
2316 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2317 /*Conditional*/ true, /*hasFinalize*/ true);
2318}
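// Since the region is conditional, the emitted IR is roughly (block and
// value names illustrative):
//   %res = call i32 @__kmpc_master(ptr @loc, i32 %tid)
//   %do.it = icmp ne i32 %res, 0
//   br i1 %do.it, label %master.body, label %master.end
//   ...
//   call void @__kmpc_end_master(ptr @loc, i32 %tid)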
2319
2320 OpenMPIRBuilder::InsertPointTy
2321 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
2322 BodyGenCallbackTy BodyGenCB,
2323 FinalizeCallbackTy FiniCB, Value *Filter) {
2324 if (!updateToLocation(Loc))
2325 return Loc.IP;
2326
2327 Directive OMPD = Directive::OMPD_masked;
2328 uint32_t SrcLocStrSize;
2329 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2330 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2331 Value *ThreadId = getOrCreateThreadID(Ident);
2332 Value *Args[] = {Ident, ThreadId, Filter};
2333 Value *ArgsEnd[] = {Ident, ThreadId};
2334
2335 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
2336 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2337
2338 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
2339 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
2340
2341 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2342 /*Conditional*/ true, /*hasFinalize*/ true);
2343}
2344
2345 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
2346 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
2347 BasicBlock *PostInsertBefore, const Twine &Name) {
2348 Module *M = F->getParent();
2349 LLVMContext &Ctx = M->getContext();
2350 Type *IndVarTy = TripCount->getType();
2351
2352 // Create the basic block structure.
2353 BasicBlock *Preheader =
2354 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
2355 BasicBlock *Header =
2356 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
2357 BasicBlock *Cond =
2358 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
2359 BasicBlock *Body =
2360 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
2361 BasicBlock *Latch =
2362 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
2363 BasicBlock *Exit =
2364 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
2365 BasicBlock *After =
2366 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
2367
2368 // Use specified DebugLoc for new instructions.
2369 Builder.SetCurrentDebugLocation(DL);
2370
2371 Builder.SetInsertPoint(Preheader);
2372 Builder.CreateBr(Header);
2373
2374 Builder.SetInsertPoint(Header);
2375 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
2376 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
2377 Builder.CreateBr(Cond);
2378
2379 Builder.SetInsertPoint(Cond);
2380 Value *Cmp =
2381 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
2382 Builder.CreateCondBr(Cmp, Body, Exit);
2383
2384 Builder.SetInsertPoint(Body);
2385 Builder.CreateBr(Latch);
2386
2387 Builder.SetInsertPoint(Latch);
2388 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
2389 "omp_" + Name + ".next", /*HasNUW=*/true);
2390 Builder.CreateBr(Header);
2391 IndVarPHI->addIncoming(Next, Latch);
2392
2393 Builder.SetInsertPoint(Exit);
2394 Builder.CreateBr(After);
2395
2396 // Remember and return the canonical control flow.
2397 LoopInfos.emplace_front();
2398 CanonicalLoopInfo *CL = &LoopInfos.front();
2399
2400 CL->Header = Header;
2401 CL->Cond = Cond;
2402 CL->Latch = Latch;
2403 CL->Exit = Exit;
2404
2405#ifndef NDEBUG
2406 CL->assertOK();
2407#endif
2408 return CL;
2409}
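// The resulting control flow, as a sketch (the body is connected to user code
// by the caller):
//   preheader -> header -> cond --(iv < tripcount)--> body -> inc -> header
//                               '--(otherwise)------> exit -> after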
2410
2411 CanonicalLoopInfo *
2412 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
2413 LoopBodyGenCallbackTy BodyGenCB,
2414 Value *TripCount, const Twine &Name) {
2415 BasicBlock *BB = Loc.IP.getBlock();
2416 BasicBlock *NextBB = BB->getNextNode();
2417
2418 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
2419 NextBB, NextBB, Name);
2420 BasicBlock *After = CL->getAfter();
2421
2422 // If location is not set, don't connect the loop.
2423 if (updateToLocation(Loc)) {
2424 // Split the loop at the insertion point: Branch to the preheader and move
2425 // every following instruction to after the loop (the After BB). Also, the
2426 // new successor is the loop's after block.
2427 spliceBB(Builder, After, /*CreateBranch=*/false);
2428 Builder.CreateBr(CL->getPreheader());
2429 }
2430
2431 // Emit the body content. We do it after connecting the loop to the CFG to
2432 // avoid that the callback encounters degenerate BBs.
2433 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
2434
2435#ifndef NDEBUG
2436 CL->assertOK();
2437#endif
2438 return CL;
2439}
2440
2441 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
2442 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
2443 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
2444 InsertPointTy ComputeIP, const Twine &Name) {
2445
2446 // Consider the following difficulties (assuming 8-bit signed integers):
2447 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
2448 // DO I = 1, 100, 50
2449 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
2450 // DO I = 100, 0, -128
2451
2452 // Start, Stop and Step must be of the same integer type.
2453 auto *IndVarTy = cast<IntegerType>(Start->getType());
2454 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
2455 assert(IndVarTy == Step->getType() && "Step type mismatch");
2456
2457 LocationDescription ComputeLoc =
2458 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
2459 updateToLocation(ComputeLoc);
2460
2461 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
2462 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
2463
2464 // Like Step, but always positive.
2465 Value *Incr = Step;
2466
2467 // Distance between Start and Stop; always positive.
2468 Value *Span;
2469
2470 // Condition for whether no iterations are executed at all, e.g. because
2471 // UB < LB.
2472 Value *ZeroCmp;
2473
2474 if (IsSigned) {
2475 // Ensure that increment is positive. If not, negate and invert LB and UB.
2476 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
2477 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
2478 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
2479 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
2480 Span = Builder.CreateSub(UB, LB, "", false, true);
2481 ZeroCmp = Builder.CreateICmp(
2482 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
2483 } else {
2484 Span = Builder.CreateSub(Stop, Start, "", true);
2485 ZeroCmp = Builder.CreateICmp(
2486 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
2487 }
2488
2489 Value *CountIfLooping;
2490 if (InclusiveStop) {
2491 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
2492 } else {
2493 // Avoid incrementing past stop since it could overflow.
2494 Value *CountIfTwo = Builder.CreateAdd(
2495 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
2496 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
2497 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
2498 }
2499 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
2500 "omp_" + Name + ".tripcount");
2501
2502 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
2503 Builder.restoreIP(CodeGenIP);
2504 Value *Span = Builder.CreateMul(IV, Step);
2505 Value *IndVar = Builder.CreateAdd(Span, Start);
2506 BodyGenCB(Builder.saveIP(), IndVar);
2507 };
2508 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
2509 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
2510}
2511
2512 // Returns an LLVM function to call for initializing loop bounds using OpenMP
2513 // static scheduling depending on `type`. Only i32 and i64 are supported by the
2514 // runtime. Always interpret integers as unsigned similarly to
2515 // CanonicalLoopInfo.
2516 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
2517 OpenMPIRBuilder &OMPBuilder) {
2518 unsigned Bitwidth = Ty->getIntegerBitWidth();
2519 if (Bitwidth == 32)
2520 return OMPBuilder.getOrCreateRuntimeFunction(
2521 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
2522 if (Bitwidth == 64)
2523 return OMPBuilder.getOrCreateRuntimeFunction(
2524 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
2525 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2526}
2527
2528 OpenMPIRBuilder::InsertPointTy
2529 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
2530 InsertPointTy AllocaIP,
2531 bool NeedsBarrier) {
2532 assert(CLI->isValid() && "Requires a valid canonical loop");
2533 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
2534 "Require dedicated allocate IP");
2535
2536 // Set up the source location value for OpenMP runtime.
2537 Builder.restoreIP(CLI->getPreheaderIP());
2538 Builder.SetCurrentDebugLocation(DL);
2539
2540 uint32_t SrcLocStrSize;
2541 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2542 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2543
2544 // Declare useful OpenMP runtime functions.
2545 Value *IV = CLI->getIndVar();
2546 Type *IVTy = IV->getType();
2547 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
2548 FunctionCallee StaticFini =
2549 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2550
2551 // Allocate space for computed loop bounds as expected by the "init" function.
2552 Builder.restoreIP(AllocaIP);
2553 Type *I32Type = Type::getInt32Ty(M.getContext());
2554 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2555 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
2556 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
2557 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
2558
2559 // At the end of the preheader, prepare for calling the "init" function by
2560 // storing the current loop bounds into the allocated space. A canonical loop
2561 // always iterates from 0 to trip-count with step 1. Note that "init" expects
2562 // and produces an inclusive upper bound.
2563 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2564 Constant *Zero = ConstantInt::get(IVTy, 0);
2565 Constant *One = ConstantInt::get(IVTy, 1);
2566 Builder.CreateStore(Zero, PLowerBound);
2567 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
2568 Builder.CreateStore(UpperBound, PUpperBound);
2569 Builder.CreateStore(One, PStride);
2570
2571 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2572
2573 Constant *SchedulingType = ConstantInt::get(
2574 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
2575
2576 // Call the "init" function and update the trip count of the loop with the
2577 // value it produced.
2578 Builder.CreateCall(StaticInit,
2579 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
2580 PUpperBound, PStride, One, Zero});
2581 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
2582 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
2583 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
2584 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
2585 CLI->setTripCount(TripCount);
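// E.g. with 100 iterations and 4 threads under schedule(static), the "init"
// call typically gives thread 1 the inclusive bounds [25, 49]; its local trip
// count then becomes 49 - 25 + 1 = 25 and its body IV is rebased by adding 25.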
2586
2587 // Update all uses of the induction variable except the one in the condition
2588 // block that compares it with the actual upper bound, and the increment in
2589 // the latch block.
2590
2591 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
2592 Builder.SetInsertPoint(CLI->getBody(),
2593 CLI->getBody()->getFirstInsertionPt());
2594 Builder.SetCurrentDebugLocation(DL);
2595 return Builder.CreateAdd(OldIV, LowerBound);
2596 });
2597
2598 // In the "exit" block, call the "fini" function.
2599 Builder.SetInsertPoint(CLI->getExit(),
2600 CLI->getExit()->getTerminator()->getIterator());
2601 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2602
2603 // Add the barrier if requested.
2604 if (NeedsBarrier)
2605 createBarrier(LocationDescription(Builder.saveIP(), DL),
2606 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2607 /* CheckCancelFlag */ false);
2608
2609 InsertPointTy AfterIP = CLI->getAfterIP();
2610 CLI->invalidate();
2611
2612 return AfterIP;
2613}
2614
2615 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
2616 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2617 bool NeedsBarrier, Value *ChunkSize) {
2618 assert(CLI->isValid() && "Requires a valid canonical loop");
2619 assert(ChunkSize && "Chunk size is required");
2620
2621 LLVMContext &Ctx = CLI->getFunction()->getContext();
2622 Value *IV = CLI->getIndVar();
2623 Value *OrigTripCount = CLI->getTripCount();
2624 Type *IVTy = IV->getType();
2625 assert(IVTy->getIntegerBitWidth() <= 64 &&
2626 "Max supported tripcount bitwidth is 64 bits");
2627 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
2628 : Type::getInt64Ty(Ctx);
2629 Type *I32Type = Type::getInt32Ty(M.getContext());
2630 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
2631 Constant *One = ConstantInt::get(InternalIVTy, 1);
2632
2633 // Declare useful OpenMP runtime functions.
2634 FunctionCallee StaticInit =
2635 getKmpcForStaticInitForType(InternalIVTy, M, *this);
2636 FunctionCallee StaticFini =
2637 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2638
2639 // Allocate space for computed loop bounds as expected by the "init" function.
2640 Builder.restoreIP(AllocaIP);
2641 Builder.SetCurrentDebugLocation(DL);
2642 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2643 Value *PLowerBound =
2644 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
2645 Value *PUpperBound =
2646 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
2647 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
2648
2649 // Set up the source location value for the OpenMP runtime.
2652
2653 // TODO: Detect overflow in ubsan or max-out with current tripcount.
2654 Value *CastedChunkSize =
2655 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
2656 Value *CastedTripCount =
2657 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
2658
2659 Constant *SchedulingType = ConstantInt::get(
2660 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
2661 Builder.CreateStore(Zero, PLowerBound);
2662 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
2663 Builder.CreateStore(OrigUpperBound, PUpperBound);
2664 Builder.CreateStore(One, PStride);
2665
2666 // Call the "init" function and update the trip count of the loop with the
2667 // value it produced.
2668 uint32_t SrcLocStrSize;
2669 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2670 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2671 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2672 Builder.CreateCall(StaticInit,
2673 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
2674 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
2675 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
2676 /*pstride=*/PStride, /*incr=*/One,
2677 /*chunk=*/CastedChunkSize});
2678
2679 // Load values written by the "init" function.
2680 Value *FirstChunkStart =
2681 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
2682 Value *FirstChunkStop =
2683 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
2684 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
2685 Value *ChunkRange =
2686 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
2687 Value *NextChunkStride =
2688 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
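// E.g. with chunk size 4 and 2 threads, thread 0 typically gets the first
// chunk [0, 3] (so ChunkRange = 4) and a stride of 8, i.e. it executes the
// chunks of the original iteration space starting at 0, 8, 16, ...
// (values illustrative).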
2689
2690 // Create outer "dispatch" loop for enumerating the chunks.
2691 BasicBlock *DispatchEnter = splitBB(Builder, true);
2692 Value *DispatchCounter;
2693 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
2694 {Builder.saveIP(), DL},
2695 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
2696 FirstChunkStart, CastedTripCount, NextChunkStride,
2697 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
2698 "dispatch");
2699
2700 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
2701 // not have to preserve the canonical invariant.
2702 BasicBlock *DispatchBody = DispatchCLI->getBody();
2703 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
2704 BasicBlock *DispatchExit = DispatchCLI->getExit();
2705 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
2706 DispatchCLI->invalidate();
2707
2708 // Rewire the original loop to become the chunk loop inside the dispatch loop.
2709 redirectTo(DispatchAfter, CLI->getAfter(), DL);
2710 redirectTo(CLI->getExit(), DispatchLatch, DL);
2711 redirectTo(DispatchBody, DispatchEnter, DL);
2712
2713 // Prepare the prolog of the chunk loop.
2714 Builder.restoreIP(CLI->getPreheaderIP());
2715 Builder.SetCurrentDebugLocation(DL);
2716
2717 // Compute the number of iterations of the chunk loop.
2718 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2719 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
2720 Value *IsLastChunk =
2721 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
2722 Value *CountUntilOrigTripCount =
2723 Builder.CreateSub(CastedTripCount, DispatchCounter);
2724 Value *ChunkTripCount = Builder.CreateSelect(
2725 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
2726 Value *BackcastedChunkTC =
2727 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
2728 CLI->setTripCount(BackcastedChunkTC);
2729
2730 // Update all uses of the induction variable except the one in the condition
2731 // block that compares it with the actual upper bound, and the increment in
2732 // the latch block.
2733 Value *BackcastedDispatchCounter =
2734 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
2735 CLI->mapIndVar([&](Instruction *) -> Value * {
2736 Builder.restoreIP(CLI->getBodyIP());
2737 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
2738 });
2739
2740 // In the "exit" block, call the "fini" function.
2741 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
2742 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2743
2744 // Add the barrier if requested.
2745 if (NeedsBarrier)
2746 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
2747 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
2748
2749#ifndef NDEBUG
2750 // Even though we currently do not support applying additional methods to it,
2751 // the chunk loop should remain a canonical loop.
2752 CLI->assertOK();
2753#endif
2754
2755 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
2756}
2757
2758 // Returns an LLVM function to call for executing an OpenMP static worksharing
2759 // for loop depending on `type`. Only i32 and i64 are supported by the runtime.
2760 // Always interpret integers as unsigned similarly to CanonicalLoopInfo.
2761static FunctionCallee
2762 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
2763 WorksharingLoopType LoopType) {
2764 unsigned Bitwidth = Ty->getIntegerBitWidth();
2765 Module &M = OMPBuilder->M;
2766 switch (LoopType) {
2767 case WorksharingLoopType::ForStaticLoop:
2768 if (Bitwidth == 32)
2769 return OMPBuilder->getOrCreateRuntimeFunction(
2770 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
2771 if (Bitwidth == 64)
2772 return OMPBuilder->getOrCreateRuntimeFunction(
2773 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
2774 break;
2775 case WorksharingLoopType::DistributeStaticLoop:
2776 if (Bitwidth == 32)
2777 return OMPBuilder->getOrCreateRuntimeFunction(
2778 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
2779 if (Bitwidth == 64)
2780 return OMPBuilder->getOrCreateRuntimeFunction(
2781 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
2782 break;
2783 case WorksharingLoopType::DistributeForStaticLoop:
2784 if (Bitwidth == 32)
2785 return OMPBuilder->getOrCreateRuntimeFunction(
2786 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
2787 if (Bitwidth == 64)
2788 return OMPBuilder->getOrCreateRuntimeFunction(
2789 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
2790 break;
2791 }
2792 if (Bitwidth != 32 && Bitwidth != 64) {
2793 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
2794 }
2795 llvm_unreachable("Unknown type of OpenMP worksharing loop");
2796}
2797
2798 // Inserts a call to the proper OpenMP device RTL function which handles
2799 // loop worksharing.
2800 static void createTargetLoopWorkshareCall(
2801 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
2802 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
2803 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
2804 Type *TripCountTy = TripCount->getType();
2805 Module &M = OMPBuilder->M;
2806 IRBuilder<> &Builder = OMPBuilder->Builder;
2807 FunctionCallee RTLFn =
2808 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
2809 SmallVector<Value *, 8> RealArgs;
2810 RealArgs.push_back(Ident);
2811 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
2812 RealArgs.push_back(LoopBodyArg);
2813 RealArgs.push_back(TripCount);
2814 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
2815 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2816 Builder.CreateCall(RTLFn, RealArgs);
2817 return;
2818 }
2819 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
2820 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
2821 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
2822 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
2823
2824 RealArgs.push_back(
2825 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
2826 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2827 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
2828 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2829 }
2830
2831 Builder.CreateCall(RTLFn, RealArgs);
2832}
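// For a plain worksharing loop the call emitted above looks roughly like
// (argument names illustrative; the trailing zero is the chunk argument):
//   call void @__kmpc_for_static_loop_4u(ptr @loc, ptr @body.fn, ptr %args,
//                                        i32 %tripcount, i32 %num.threads,
//                                        i32 0)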
2833
2834static void
2835 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
2836 CanonicalLoopInfo *CLI, Value *Ident,
2837 Function &OutlinedFn, Type *ParallelTaskPtr,
2838 const SmallVector<Instruction *, 4> &ToBeDeleted,
2839 WorksharingLoopType LoopType) {
2840 IRBuilder<> &Builder = OMPIRBuilder->Builder;
2841 BasicBlock *Preheader = CLI->getPreheader();
2842 Value *TripCount = CLI->getTripCount();
2843
2844 // After loop body outlining, the loop body contains only the setup
2845 // of loop body argument structure and the call to the outlined
2846 // loop body function. Firstly, we need to move setup of loop body args
2847 // into loop preheader.
2848 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
2849 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
2850
2851 // The next step is to remove the whole loop. We do not need it anymore.
2852 // That's why we make an unconditional branch from the loop preheader to the
2853 // loop exit block.
2854 Builder.restoreIP({Preheader, Preheader->end()});
2855 Preheader->getTerminator()->eraseFromParent();
2856 Builder.CreateBr(CLI->getExit());
2857
2858 // Delete dead loop blocks
2859 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
2860 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
2861 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
2862 CleanUpInfo.EntryBB = CLI->getHeader();
2863 CleanUpInfo.ExitBB = CLI->getExit();
2864 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
2865 DeleteDeadBlocks(BlocksToBeRemoved);
2866
2867 // Find the instruction which corresponds to the loop body argument
2868 // structure, and remove the call to the loop body function.
2869 Value *LoopBodyArg;
2870 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
2871 assert(OutlinedFnUser &&
2872 "Expected unique undroppable user of outlined function");
2873 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
2874 assert(OutlinedFnCallInstruction && "Expected outlined function call");
2875 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
2876 "Expected outlined function call to be located in loop preheader");
2877 // Check in case no argument structure has been passed.
2878 if (OutlinedFnCallInstruction->arg_size() > 1)
2879 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
2880 else
2881 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
2882 OutlinedFnCallInstruction->eraseFromParent();
2883
2884 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
2885 LoopBodyArg, ParallelTaskPtr, TripCount,
2886 OutlinedFn);
2887
2888 for (auto &ToBeDeletedItem : ToBeDeleted)
2889 ToBeDeletedItem->eraseFromParent();
2890 CLI->invalidate();
2891}
2892
2893 OpenMPIRBuilder::InsertPointTy
2894 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
2895 InsertPointTy AllocaIP,
2896 WorksharingLoopType LoopType) {
2897 uint32_t SrcLocStrSize;
2898 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2899 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2900
2901 OutlineInfo OI;
2902 OI.OuterAllocaBB = CLI->getPreheader();
2903 Function *OuterFn = CLI->getPreheader()->getParent();
2904
2905 // Instructions which need to be deleted at the end of code generation
2906 SmallVector<Instruction *, 4> ToBeDeleted;
2907
2908 OI.OuterAllocaBB = AllocaIP.getBlock();
2909
2910 // Mark the loop body as the region that needs to be extracted
2911 OI.EntryBB = CLI->getBody();
2912 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
2913 "omp.prelatch", true);
2914
2915 // Prepare loop body for extraction
2916 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
2917
2918 // Insert new loop counter variable which will be used only in loop
2919 // body.
2920 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
2921 Instruction *NewLoopCntLoad =
2922 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
2923 // New loop counter instructions are redundant in the loop preheader when
2924 // code generation for the workshare loop is finished. That's why we mark
2925 // them as ready for deletion.
2926 ToBeDeleted.push_back(NewLoopCntLoad);
2927 ToBeDeleted.push_back(NewLoopCnt);
2928
2929 // Analyse loop body region. Find all input variables which are used inside
2930 // loop body region.
2931 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
2932 SmallVector<BasicBlock *, 32> Blocks;
2933 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
2934 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
2935 ParallelRegionBlockSet.end());
2936
2937 CodeExtractorAnalysisCache CEAC(*OuterFn);
2938 CodeExtractor Extractor(Blocks,
2939 /* DominatorTree */ nullptr,
2940 /* AggregateArgs */ true,
2941 /* BlockFrequencyInfo */ nullptr,
2942 /* BranchProbabilityInfo */ nullptr,
2943 /* AssumptionCache */ nullptr,
2944 /* AllowVarArgs */ true,
2945 /* AllowAlloca */ true,
2946 /* AllocationBlock */ CLI->getPreheader(),
2947 /* Suffix */ ".omp_wsloop",
2948 /* AggrArgsIn0AddrSpace */ true);
2949
2950 BasicBlock *CommonExit = nullptr;
2951 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
2952
2953 // Find allocas outside the loop body region which are used inside loop
2954 // body
2955 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
2956
2957 // We need to model loop body region as the function f(cnt, loop_arg).
2958 // That's why we replace the loop induction variable with the new counter,
2959 // which will be one of the loop body function arguments.
2960 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
2961 CLI->getIndVar()->user_end());
2962 for (auto Use : Users) {
2963 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
2964 if (ParallelRegionBlockSet.count(Inst->getParent())) {
2965 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
2966 }
2967 }
2968 }
2969 // Make sure that loop counter variable is not merged into loop body
2970 // function argument structure and it is passed as separate variable
2971 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
2972
2973 // PostOutline CB is invoked when loop body function is outlined and
2974 // loop body is replaced by call to outlined function. We need to add
2975 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
2976 // function will handle loop control logic.
2977 //
2978 OI.PostOutlineCB = [=, ToBeDeletedVec =
2979 std::move(ToBeDeleted)](Function &OutlinedFn) {
2980 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
2981 ToBeDeletedVec, LoopType);
2982 };
2983 addOutlineInfo(std::move(OI));
2984 return CLI->getAfterIP();
2985}
2986
2987 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
2988 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2989 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
2990 bool HasSimdModifier, bool HasMonotonicModifier,
2991 bool HasNonmonotonicModifier, bool HasOrderedClause,
2992 WorksharingLoopType LoopType) {
2993 if (Config.isTargetDevice())
2994 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
2995 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
2996 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
2997 HasNonmonotonicModifier, HasOrderedClause);
2998
2999 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
3000 OMPScheduleType::ModifierOrdered;
3001 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
3002 case OMPScheduleType::BaseStatic:
3003 assert(!ChunkSize && "No chunk size with static-chunked schedule");
3004 if (IsOrdered)
3005 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3006 NeedsBarrier, ChunkSize);
3007 // FIXME: Monotonicity ignored?
3008 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
3009
3010 case OMPScheduleType::BaseStaticChunked:
3011 if (IsOrdered)
3012 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3013 NeedsBarrier, ChunkSize);
3014 // FIXME: Monotonicity ignored?
3015 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
3016 ChunkSize);
3017
3018 case OMPScheduleType::BaseRuntime:
3019 case OMPScheduleType::BaseAuto:
3020 case OMPScheduleType::BaseGreedy:
3021 case OMPScheduleType::BaseBalanced:
3022 case OMPScheduleType::BaseSteal:
3023 case OMPScheduleType::BaseGuidedSimd:
3024 case OMPScheduleType::BaseRuntimeSimd:
3025 assert(!ChunkSize &&
3026 "schedule type does not support user-defined chunk sizes");
3027 [[fallthrough]];
3028 case OMPScheduleType::BaseDynamicChunked:
3029 case OMPScheduleType::BaseGuidedChunked:
3030 case OMPScheduleType::BaseGuidedIterativeChunked:
3031 case OMPScheduleType::BaseGuidedAnalyticalChunked:
3032 case OMPScheduleType::BaseStaticBalancedChunked:
3033 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3034 NeedsBarrier, ChunkSize);
3035
3036 default:
3037 llvm_unreachable("Unknown/unimplemented schedule kind");
3038 }
3039}
3040
3041 /// Returns an LLVM function to call for initializing loop bounds using OpenMP
3042 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
3043 /// the runtime. Always interpret integers as unsigned similarly to
3044 /// CanonicalLoopInfo.
3045static FunctionCallee
3046 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3047 unsigned Bitwidth = Ty->getIntegerBitWidth();
3048 if (Bitwidth == 32)
3049 return OMPBuilder.getOrCreateRuntimeFunction(
3050 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
3051 if (Bitwidth == 64)
3052 return OMPBuilder.getOrCreateRuntimeFunction(
3053 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
3054 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3055}
3056
3057 /// Returns an LLVM function to call for updating the next loop using OpenMP
3058 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
3059 /// the runtime. Always interpret integers as unsigned similarly to
3060 /// CanonicalLoopInfo.
3061static FunctionCallee
3062 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3063 unsigned Bitwidth = Ty->getIntegerBitWidth();
3064 if (Bitwidth == 32)
3065 return OMPBuilder.getOrCreateRuntimeFunction(
3066 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
3067 if (Bitwidth == 64)
3068 return OMPBuilder.getOrCreateRuntimeFunction(
3069 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
3070 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3071}
3072
3073/// Returns an LLVM function to call for finalizing the dynamic loop,
3074/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
3075/// interpret integers as unsigned similarly to CanonicalLoopInfo.
3076static FunctionCallee
3077getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3078 unsigned Bitwidth = Ty->getIntegerBitWidth();
3079 if (Bitwidth == 32)
3080 return OMPBuilder.getOrCreateRuntimeFunction(
3081 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
3082 if (Bitwidth == 64)
3083 return OMPBuilder.getOrCreateRuntimeFunction(
3084 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
3085 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3086}
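// Taken together, the three helpers above select the {init, next, fini}
// entry points of the runtime's dynamic dispatch protocol for the loop's IV
// width. A minimal sketch of the contract they bind to, for a 32-bit IV
// (runtime pseudo-code, not the IR emitted below):
//
//   __kmpc_dispatch_init_4u(loc, tid, sched, lb, ub, stride, chunk);
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &stride)) {
//     for (uint32_t iv = lb; iv <= ub; ++iv) // "next" hands out inclusive ubs
//       body(iv);
//   }
//
// With the ordered modifier, __kmpc_dispatch_fini_4u is additionally called
// once per iteration from the loop latch (see applyDynamicWorkshareLoop).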
3087
3088OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
3089 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
3090 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
3091 assert(CLI->isValid() && "Requires a valid canonical loop");
3092 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3093 "Require dedicated allocate IP");
3094 assert(isValidWorkshareLoopScheduleType(SchedType) &&
3095 "Require valid schedule type");
3096
3097 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
3098 OMPScheduleType::ModifierOrdered;
3099
3100 // Set up the source location value for OpenMP runtime.
3101 Builder.SetCurrentDebugLocation(DL);
3102
3103 uint32_t SrcLocStrSize;
3104 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
3105 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3106
3107 // Declare useful OpenMP runtime functions.
3108 Value *IV = CLI->getIndVar();
3109 Type *IVTy = IV->getType();
3110 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
3111 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
3112
3113 // Allocate space for computed loop bounds as expected by the "init" function.
3114 Builder.restoreIP(AllocaIP);
3115 Type *I32Type = Type::getInt32Ty(M.getContext());
3116 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
3117 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
3118 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
3119 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
3120
3121 // At the end of the preheader, prepare for calling the "init" function by
3122 // storing the current loop bounds into the allocated space. A canonical loop
3123 // always iterates from 0 to trip-count with step 1. Note that "init" expects
3124 // and produces an inclusive upper bound.
3125 BasicBlock *PreHeader = CLI->getPreheader();
3126 Builder.SetInsertPoint(PreHeader->getTerminator());
3127 Constant *One = ConstantInt::get(IVTy, 1);
3128 Builder.CreateStore(One, PLowerBound);
3129 Value *UpperBound = CLI->getTripCount();
3130 Builder.CreateStore(UpperBound, PUpperBound);
3131 Builder.CreateStore(One, PStride);
3132
3133 BasicBlock *Header = CLI->getHeader();
3134 BasicBlock *Exit = CLI->getExit();
3135 BasicBlock *Cond = CLI->getCond();
3136 BasicBlock *Latch = CLI->getLatch();
3137 InsertPointTy AfterIP = CLI->getAfterIP();
3138
3139 // The CLI will be "broken" in the code below, as the loop is no longer
3140 // a valid canonical loop.
3141
3142 if (!Chunk)
3143 Chunk = One;
3144
3145 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
3146
3147 Constant *SchedulingType =
3148 ConstantInt::get(I32Type, static_cast<int>(SchedType));
3149
3150 // Call the "init" function.
3151 Builder.CreateCall(DynamicInit,
3152 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
3153 UpperBound, /* step */ One, Chunk});
3154
3155 // An outer loop around the existing one.
3156 BasicBlock *OuterCond = BasicBlock::Create(
3157 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
3158 PreHeader->getParent());
3159 // The "next" result below is always 32-bit, so an IVTy-typed zero cannot be reused; a dedicated Zero32 is created below.
3160 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
3161 Value *Res =
3162 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
3163 PLowerBound, PUpperBound, PStride});
3164 Constant *Zero32 = ConstantInt::get(I32Type, 0);
3165 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
3166 Value *LowerBound =
3167 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
3168 Builder.CreateCondBr(MoreWork, Header, Exit);
3169
3170 // Change PHI-node in loop header to use outer cond rather than preheader,
3171 // and set IV to the LowerBound.
3172 Instruction *Phi = &Header->front();
3173 auto *PI = cast<PHINode>(Phi);
3174 PI->setIncomingBlock(0, OuterCond);
3175 PI->setIncomingValue(0, LowerBound);
3176
3177 // Then set the pre-header to jump to the OuterCond
3178 Instruction *Term = PreHeader->getTerminator();
3179 auto *Br = cast<BranchInst>(Term);
3180 Br->setSuccessor(0, OuterCond);
3181
3182 // Modify the inner condition:
3183 // * Use the UpperBound returned from the DynamicNext call.
3184 // * jump to the outer loop when done with one of the inner loops.
3185 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
3186 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
3187 Instruction *Comp = &*Builder.GetInsertPoint();
3188 auto *CI = cast<CmpInst>(Comp);
3189 CI->setOperand(1, UpperBound);
3190 // Redirect the inner exit to branch to outer condition.
3191 Instruction *Branch = &Cond->back();
3192 auto *BI = cast<BranchInst>(Branch);
3193 assert(BI->getSuccessor(1) == Exit);
3194 BI->setSuccessor(1, OuterCond);
3195
3196 // Call the "fini" function if "ordered" is present in wsloop directive.
3197 if (Ordered) {
3198 Builder.SetInsertPoint(&Latch->back());
3199 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
3200 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
3201 }
3202
3203 // Add the barrier if requested.
3204 if (NeedsBarrier) {
3205 Builder.SetInsertPoint(&Exit->back());
3206 createBarrier(LocationDescription(Builder.saveIP(), DL),
3207 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
3208 /* CheckCancelFlag */ false);
3209 }
3210
3211 CLI->invalidate();
3212 return AfterIP;
3213}
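// After the rewrite above, the original canonical loop is nested inside a
// dispatch loop. Schematically (a sketch of the control flow, not verbatim
// IR):
//
//   preheader:  store lb/ub/stride; __kmpc_dispatch_init(...); br outer.cond
//   outer.cond: more = __kmpc_dispatch_next(&lastiter, &lb, &ub, &stride)
//               br more, header, exit
//   header:     iv = phi [lb - 1, outer.cond], [iv.next, latch]
//   cond:       continue while iv is below the chunk-local bound loaded from
//               p.upperbound; otherwise br outer.cond to fetch the next chunk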
3214
3215/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
3216/// after this \p OldTarget will be orphaned.
3217static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
3218 BasicBlock *NewTarget, DebugLoc DL) {
3219 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
3220 redirectTo(Pred, NewTarget, DL);
3221}
3222
3223/// Determine which blocks in \p BBs are reachable from outside and remove the
3224/// unreachable ones from the function.
3225static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
3226 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
3227 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
3228 for (Use &U : BB->uses()) {
3229 auto *UseInst = dyn_cast<Instruction>(U.getUser());
3230 if (!UseInst)
3231 continue;
3232 if (BBsToErase.count(UseInst->getParent()))
3233 continue;
3234 return true;
3235 }
3236 return false;
3237 };
3238
3239 while (true) {
3240 bool Changed = false;
3241 for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
3242 if (HasRemainingUses(BB)) {
3243 BBsToErase.erase(BB);
3244 Changed = true;
3245 }
3246 }
3247 if (!Changed)
3248 break;
3249 }
3250
3251 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
3252 DeleteDeadBlocks(BBVec);
3253}
3254
3255CanonicalLoopInfo *
3256OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3257 InsertPointTy ComputeIP) {
3258 assert(Loops.size() >= 1 && "At least one loop required");
3259 size_t NumLoops = Loops.size();
3260
3261 // Nothing to do if there is already just one loop.
3262 if (NumLoops == 1)
3263 return Loops.front();
3264
3265 CanonicalLoopInfo *Outermost = Loops.front();
3266 CanonicalLoopInfo *Innermost = Loops.back();
3267 BasicBlock *OrigPreheader = Outermost->getPreheader();
3268 BasicBlock *OrigAfter = Outermost->getAfter();
3269 Function *F = OrigPreheader->getParent();
3270
3271 // Loop control blocks that may become orphaned later.
3272 SmallVector<BasicBlock *, 12> OldControlBBs;
3273 OldControlBBs.reserve(6 * Loops.size());
3274 for (CanonicalLoopInfo *Loop : Loops)
3275 Loop->collectControlBlocks(OldControlBBs);
3276
3277 // Setup the IRBuilder for inserting the trip count computation.
3278 Builder.SetCurrentDebugLocation(DL);
3279 if (ComputeIP.isSet())
3280 Builder.restoreIP(ComputeIP);
3281 else
3282 Builder.restoreIP(Outermost->getPreheaderIP());
3283
3284 // Derive the collapsed loop's trip count.
3285 // TODO: Find common/largest indvar type.
3286 Value *CollapsedTripCount = nullptr;
3287 for (CanonicalLoopInfo *L : Loops) {
3288 assert(L->isValid() &&
3289 "All loops to collapse must be valid canonical loops");
3290 Value *OrigTripCount = L->getTripCount();
3291 if (!CollapsedTripCount) {
3292 CollapsedTripCount = OrigTripCount;
3293 continue;
3294 }
3295
3296 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
3297 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
3298 {}, /*HasNUW=*/true);
3299 }
3300
3301 // Create the collapsed loop control flow.
3302 CanonicalLoopInfo *Result =
3303 createLoopSkeleton(DL, CollapsedTripCount, F,
3304 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
3305
3306 // Build the collapsed loop body code.
3307 // Start with deriving the input loop induction variables from the collapsed
3308 // one, using a divmod scheme. To preserve the original loops' order, the
3309 // innermost loop uses the least significant bits.
3310 Builder.restoreIP(Result->getBodyIP());
3311
3312 Value *Leftover = Result->getIndVar();
3313 SmallVector<Value *> NewIndVars;
3314 NewIndVars.resize(NumLoops);
3315 for (int i = NumLoops - 1; i >= 1; --i) {
3316 Value *OrigTripCount = Loops[i]->getTripCount();
3317
3318 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
3319 NewIndVars[i] = NewIndVar;
3320
3321 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
3322 }
3323 // Outermost loop gets all the remaining bits.
3324 NewIndVars[0] = Leftover;
3325
3326 // Construct the loop body control flow.
3327 // We progressively construct the branch structure following the direction of
3328 // the control flow: the leading in-between code, the loop nest body, the
3329 // trailing in-between code, and finally rejoining the collapsed loop's latch.
3330 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
3331 // the ContinueBlock is set, continue with that block. If ContinuePred, use
3332 // its predecessors as sources.
3333 BasicBlock *ContinueBlock = Result->getBody();
3334 BasicBlock *ContinuePred = nullptr;
3335 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
3336 BasicBlock *NextSrc) {
3337 if (ContinueBlock)
3338 redirectTo(ContinueBlock, Dest, DL);
3339 else
3340 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
3341
3342 ContinueBlock = nullptr;
3343 ContinuePred = NextSrc;
3344 };
3345
3346 // The code before the nested loop of each level.
3347 // Because we are sinking it into the nest, it will be executed more often
3348 // than the original loop. More sophisticated schemes could keep track of what
3349 // the in-between code is and instantiate it only once per thread.
3350 for (size_t i = 0; i < NumLoops - 1; ++i)
3351 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
3352
3353 // Connect the loop nest body.
3354 ContinueWith(Innermost->getBody(), Innermost->getLatch());
3355
3356 // The code after the nested loop at each level.
3357 for (size_t i = NumLoops - 1; i > 0; --i)
3358 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
3359
3360 // Connect the finished loop to the collapsed loop latch.
3361 ContinueWith(Result->getLatch(), nullptr);
3362
3363 // Replace the input loops with the new collapsed loop.
3364 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
3365 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
3366
3367 // Replace the input loop indvars with the derived ones.
3368 for (size_t i = 0; i < NumLoops; ++i)
3369 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
3370
3371 // Remove unused parts of the input loops.
3372 removeUnusedBlocksFromParent(OldControlBBs);
3373
3374 for (CanonicalLoopInfo *L : Loops)
3375 L->invalidate();
3376
3377#ifndef NDEBUG
3378 Result->assertOK();
3379#endif
3380 return Result;
3381}
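// Worked example of the divmod scheme for two collapsed loops with trip
// counts M (outer) and N (inner): the collapsed IV cv runs over [0, M*N),
// and the original IVs are recovered as
//
//   j = cv % N;  // innermost loop: least significant "digits"
//   i = cv / N;  // outermost loop: all remaining bits
//
// which preserves the lexicographic iteration order of the original nest.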
3382
3383std::vector<CanonicalLoopInfo *>
3384OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3385 ArrayRef<Value *> TileSizes) {
3386 assert(TileSizes.size() == Loops.size() &&
3387 "Must pass as many tile sizes as there are loops");
3388 int NumLoops = Loops.size();
3389 assert(NumLoops >= 1 && "At least one loop to tile required");
3390
3391 CanonicalLoopInfo *OutermostLoop = Loops.front();
3392 CanonicalLoopInfo *InnermostLoop = Loops.back();
3393 Function *F = OutermostLoop->getBody()->getParent();
3394 BasicBlock *InnerEnter = InnermostLoop->getBody();
3395 BasicBlock *InnerLatch = InnermostLoop->getLatch();
3396
3397 // Loop control blocks that may become orphaned later.
3398 SmallVector<BasicBlock *, 12> OldControlBBs;
3399 OldControlBBs.reserve(6 * Loops.size());
3400 for (CanonicalLoopInfo *Loop : Loops)
3401 Loop->collectControlBlocks(OldControlBBs);
3402
3403 // Collect original trip counts and induction variables to be accessible by
3404 // index. Also, the structure of the original loops is not preserved during
3405 // the construction of the tiled loops, so do it before we scavenge the BBs of
3406 // any original CanonicalLoopInfo.
3407 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
3408 for (CanonicalLoopInfo *L : Loops) {
3409 assert(L->isValid() && "All input loops must be valid canonical loops");
3410 OrigTripCounts.push_back(L->getTripCount());
3411 OrigIndVars.push_back(L->getIndVar());
3412 }
3413
3414 // Collect the code between loop headers. These may contain SSA definitions
3415 // that are used in the loop nest body. To be usable within the innermost
3416 // body, these BasicBlocks will be sunk into the loop nest body. That is,
3417 // these instructions may be executed more often than before the tiling.
3418 // TODO: It would be sufficient to only sink them into body of the
3419 // corresponding tile loop.
3420 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
3421 for (int i = 0; i < NumLoops - 1; ++i) {
3422 CanonicalLoopInfo *Surrounding = Loops[i];
3423 CanonicalLoopInfo *Nested = Loops[i + 1];
3424
3425 BasicBlock *EnterBB = Surrounding->getBody();
3426 BasicBlock *ExitBB = Nested->getHeader();
3427 InbetweenCode.emplace_back(EnterBB, ExitBB);
3428 }
3429
3430 // Compute the trip counts of the floor loops.
3431 Builder.SetCurrentDebugLocation(DL);
3432 Builder.restoreIP(OutermostLoop->getPreheaderIP());
3433 SmallVector<Value *, 4> FloorCount, FloorRems;
3434 for (int i = 0; i < NumLoops; ++i) {
3435 Value *TileSize = TileSizes[i];
3436 Value *OrigTripCount = OrigTripCounts[i];
3437 Type *IVType = OrigTripCount->getType();
3438
3439 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
3440 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
3441
3442 // 0 if tripcount divides the tilesize, 1 otherwise.
3443 // 1 means we need an additional iteration for a partial tile.
3444 //
3445 // Unfortunately we cannot just use the roundup-formula
3446 // (tripcount + tilesize - 1)/tilesize
3447 // because the summation might overflow. We do not want to introduce undefined
3448 // behavior when the untiled loop nest did not.
3449 Value *FloorTripOverflow =
3450 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
3451
3452 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
3453 FloorTripCount =
3454 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
3455 "omp_floor" + Twine(i) + ".tripcount", true);
3456
3457 // Remember some values for later use.
3458 FloorCount.push_back(FloorTripCount);
3459 FloorRems.push_back(FloorTripRem);
3460 }
3461
3462 // Generate the new loop nest, from the outermost to the innermost.
3463 std::vector<CanonicalLoopInfo *> Result;
3464 Result.reserve(NumLoops * 2);
3465
3466 // The basic block of the surrounding loop that enters the nest generated
3467 // loop.
3468 BasicBlock *Enter = OutermostLoop->getPreheader();
3469
3470 // The basic block of the surrounding loop where the inner code should
3471 // continue.
3472 BasicBlock *Continue = OutermostLoop->getAfter();
3473
3474 // Where the next loop basic block should be inserted.
3475 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
3476
3477 auto EmbeddNewLoop =
3478 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
3479 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
3480 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
3481 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
3482 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
3483 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
3484
3485 // Setup the position where the next embedded loop connects to this loop.
3486 Enter = EmbeddedLoop->getBody();
3487 Continue = EmbeddedLoop->getLatch();
3488 OutroInsertBefore = EmbeddedLoop->getLatch();
3489 return EmbeddedLoop;
3490 };
3491
3492 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
3493 const Twine &NameBase) {
3494 for (auto P : enumerate(TripCounts)) {
3495 CanonicalLoopInfo *EmbeddedLoop =
3496 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
3497 Result.push_back(EmbeddedLoop);
3498 }
3499 };
3500
3501 EmbeddNewLoops(FloorCount, "floor");
3502
3503 // Within the innermost floor loop, emit the code that computes the tile
3504 // sizes.
3505 Builder.restoreIP(Result.back()->getBodyIP());
3506 SmallVector<Value *, 4> TileCounts;
3507 for (int i = 0; i < NumLoops; ++i) {
3508 CanonicalLoopInfo *FloorLoop = Result[i];
3509 Value *TileSize = TileSizes[i];
3510
3511 Value *FloorIsEpilogue =
3512 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
3513 Value *TileTripCount =
3514 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
3515
3516 TileCounts.push_back(TileTripCount);
3517 }
3518
3519 // Create the tile loops.
3520 EmbeddNewLoops(TileCounts, "tile");
3521
3522 // Insert the inbetween code into the body.
3523 BasicBlock *BodyEnter = Enter;
3524 BasicBlock *BodyEntered = nullptr;
3525 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
3526 BasicBlock *EnterBB = P.first;
3527 BasicBlock *ExitBB = P.second;
3528
3529 if (BodyEnter)
3530 redirectTo(BodyEnter, EnterBB, DL);
3531 else
3532 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
3533
3534 BodyEnter = nullptr;
3535 BodyEntered = ExitBB;
3536 }
3537
3538 // Append the original loop nest body into the generated loop nest body.
3539 if (BodyEnter)
3540 redirectTo(BodyEnter, InnerEnter, DL);
3541 else
3542 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
3543 redirectTo(InnerLatch, Continue, DL);
3544
3545 // Replace the original induction variable with an induction variable computed
3546 // from the tile and floor induction variables.
3547 Builder.restoreIP(Result.back()->getBodyIP());
3548 for (int i = 0; i < NumLoops; ++i) {
3549 CanonicalLoopInfo *FloorLoop = Result[i];
3550 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
3551 Value *OrigIndVar = OrigIndVars[i];
3552 Value *Size = TileSizes[i];
3553
3554 Value *Scale =
3555 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
3556 Value *Shift =
3557 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
3558 OrigIndVar->replaceAllUsesWith(Shift);
3559 }
3560
3561 // Remove unused parts of the original loops.
3562 removeUnusedBlocksFromParent(OldControlBBs);
3563
3564 for (CanonicalLoopInfo *L : Loops)
3565 L->invalidate();
3566
3567#ifndef NDEBUG
3568 for (CanonicalLoopInfo *GenL : Result)
3569 GenL->assertOK();
3570#endif
3571 return Result;
3572}
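// Worked example (illustrative pseudo-code, not the emitted IR): tiling
//   for (i = 0; i < TC; ++i) body(i);
// with tile size TS produces a floor loop over the tiles and a tile loop over
// the elements of the current tile,
//   for (f = 0; f < FloorTripCount; ++f)   // FloorTripCount = ceildiv(TC, TS)
//     for (t = 0; t < TileTripCount; ++t)  // TS, or TC % TS in a partial tile
//       body(f * TS + t);
// where the partial-tile trip count comes from the FloorIsEpilogue select and
// the original IV is rebuilt as i = f * TS + t above.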
3573
3574/// Attach metadata \p Properties to the basic block described by \p BB. If the
3575/// basic block already has metadata, the basic block properties are appended.
3576static void addBasicBlockMetadata(BasicBlock *BB,
3577 ArrayRef<Metadata *> Properties) {
3578 // Nothing to do if no property to attach.
3579 if (Properties.empty())
3580 return;
3581
3582 LLVMContext &Ctx = BB->getContext();
3583 SmallVector<Metadata *> NewProperties;
3584 NewProperties.push_back(nullptr);
3585
3586 // If the basic block already has metadata, prepend it to the new metadata.
3587 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
3588 if (Existing)
3589 append_range(NewProperties, drop_begin(Existing->operands(), 1));
3590
3591 append_range(NewProperties, Properties);
3592 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
3593 BasicBlockID->replaceOperandWith(0, BasicBlockID);
3594
3595 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
3596}
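// For reference, the self-referential loop ID built above renders in textual
// IR roughly as follows (illustrative):
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
//
// The first operand pointing back at the node itself is what makes this a
// valid loop ID that passes such as LoopUnroll recognize.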
3597
3598/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
3599/// loop already has metadata, the loop properties are appended.
3600static void addLoopMetadata(CanonicalLoopInfo *Loop,
3601 ArrayRef<Metadata *> Properties) {
3602 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
3603
3604 // Attach metadata to the loop's latch
3605 BasicBlock *Latch = Loop->getLatch();
3606 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
3607 addBasicBlockMetadata(Latch, Properties);
3608}
3609
3610/// Attach llvm.access.group metadata to the memref instructions of \p Block
3611static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
3612 LoopInfo &LI) {
3613 for (Instruction &I : *Block) {
3614 if (I.mayReadOrWriteMemory()) {
3615 // TODO: This instruction may already have access group from
3616 // other pragmas e.g. #pragma clang loop vectorize. Append
3617 // so that the existing metadata is not overwritten.
3618 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
3619 }
3620 }
3621}
3622
3623void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
3624 LLVMContext &Ctx = Builder.getContext();
3625 addLoopMetadata(
3626 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3627 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
3628}
3629
3630void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
3631 LLVMContext &Ctx = Builder.getContext();
3632 addLoopMetadata(
3633 Loop, {
3634 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3635 });
3636}
3637
3638void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
3639 Value *IfCond, ValueToValueMapTy &VMap,
3640 const Twine &NamePrefix) {
3641 Function *F = CanonicalLoop->getFunction();
3642
3643 // Define where the if branch should be inserted.
3644 Instruction *SplitBefore;
3645 if (Instruction::classof(IfCond)) {
3646 SplitBefore = dyn_cast<Instruction>(IfCond);
3647 } else {
3648 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
3649 }
3650
3651 // TODO: We should not rely on pass manager. Currently we use pass manager
3652 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
3653 // object. We should have a method which returns all blocks between
3654 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
3655 FunctionAnalysisManager FAM;
3656 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3657 FAM.registerPass([]() { return LoopAnalysis(); });
3658 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3659
3660 // Get the loop which needs to be cloned
3661 LoopAnalysis LIA;
3662 LoopInfo &&LI = LIA.run(*F, FAM);
3663 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3664
3665 // Create additional blocks for the if statement
3666 BasicBlock *Head = SplitBefore->getParent();
3667 Instruction *HeadOldTerm = Head->getTerminator();
3668 llvm::LLVMContext &C = Head->getContext();
3669 llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
3670 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
3671 llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
3672 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
3673
3674 // Create if condition branch.
3675 Builder.SetInsertPoint(HeadOldTerm);
3676 Instruction *BrInstr =
3677 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
3678 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
3679 // The then-block contains the branch to the OpenMP loop to be vectorized.
3680 spliceBB(IP, ThenBlock, false);
3681 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
3682
3683 Builder.SetInsertPoint(ElseBlock);
3684
3685 // Clone loop for the else branch
3686 SmallVector<BasicBlock *, 8> NewBlocks;
3687
3688 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
3689 for (BasicBlock *Block : L->getBlocks()) {
3690 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
3691 NewBB->moveBefore(CanonicalLoop->getExit());
3692 VMap[Block] = NewBB;
3693 NewBlocks.push_back(NewBB);
3694 }
3695 remapInstructionsInBlocks(NewBlocks, VMap);
3696 Builder.CreateBr(NewBlocks.front());
3697}
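// Schematically, the versioning above rewrites `preheader -> loop` into the
// following shape (shown for NamePrefix == "simd"; a sketch only):
//
//   head:          br i1 %ifcond, label %simd.if.then, label %simd.if.else
//   simd.if.then:  original loop, later marked parallel/vectorizable
//   simd.if.else:  cloned loop, marked llvm.loop.vectorize.enable = false
//
// Both versions rejoin at the original loop exit.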
3698
3699unsigned
3700OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
3701 const StringMap<bool> &Features) {
3702 if (TargetTriple.isX86()) {
3703 if (Features.lookup("avx512f"))
3704 return 512;
3705 else if (Features.lookup("avx"))
3706 return 256;
3707 return 128;
3708 }
3709 if (TargetTriple.isPPC())
3710 return 128;
3711 if (TargetTriple.isWasm())
3712 return 128;
3713 return 0;
3714}
3715
3716void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
3717 MapVector<Value *, Value *> AlignedVars,
3718 Value *IfCond, OrderKind Order,
3719 ConstantInt *Simdlen, ConstantInt *Safelen) {
3720 LLVMContext &Ctx = Builder.getContext();
3721
3722 Function *F = CanonicalLoop->getFunction();
3723
3724 // TODO: We should not rely on pass manager. Currently we use pass manager
3725 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
3726 // object. We should have a method which returns all blocks between
3727 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
3728 FunctionAnalysisManager FAM;
3729 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3730 FAM.registerPass([]() { return LoopAnalysis(); });
3731 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3732
3733 LoopAnalysis LIA;
3734 LoopInfo &&LI = LIA.run(*F, FAM);
3735
3736 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3737 if (AlignedVars.size()) {
3738 InsertPointTy IP = Builder.saveIP();
3739 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
3740 for (auto &AlignedItem : AlignedVars) {
3741 Value *AlignedPtr = AlignedItem.first;
3742 Value *Alignment = AlignedItem.second;
3743 Builder.CreateAlignmentAssumption(F->getParent()->getDataLayout(),
3744 AlignedPtr, Alignment);
3745 }
3746 Builder.restoreIP(IP);
3747 }
3748
3749 if (IfCond) {
3750 ValueToValueMapTy VMap;
3751 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
3752 // Add metadata to the cloned loop which disables vectorization
3753 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
3754 assert(MappedLatch &&
3755 "Cannot find value which corresponds to original loop latch");
3756 assert(isa<BasicBlock>(MappedLatch) &&
3757 "Cannot cast mapped latch block value to BasicBlock");
3758 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
3759 ConstantAsMetadata *BoolConst =
3760 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
3761 addBasicBlockMetadata(
3762 NewLatchBlock,
3763 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
3764 BoolConst})});
3765 }
3766
3767 SmallSet<BasicBlock *, 8> Reachable;
3768
3769 // Get the basic blocks from the loop in which memref instructions
3770 // can be found.
3771 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
3772 // preferably without running any passes.
3773 for (BasicBlock *Block : L->getBlocks()) {
3774 if (Block == CanonicalLoop->getCond() ||
3775 Block == CanonicalLoop->getHeader())
3776 continue;
3777 Reachable.insert(Block);
3778 }
3779
3780 SmallVector<Metadata *> LoopMDList;
3781
3782 // In the presence of a finite 'safelen', it may be unsafe to mark all
3783 // the memory instructions parallel, because loop-carried
3784 // dependences of 'safelen' iterations are possible.
3785 // If clause order(concurrent) is specified then the memory instructions
3786 // are marked parallel even if 'safelen' is finite.
3787 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
3788 // Add access group metadata to memory-access instructions.
3789 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
3790 for (BasicBlock *BB : Reachable)
3791 addSimdMetadata(BB, AccessGroup, LI);
3792 // TODO: If the loop has existing parallel access metadata, have
3793 // to combine two lists.
3794 LoopMDList.push_back(MDNode::get(
3795 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
3796 }
3797
3798 // Use the above access group metadata to create loop level
3799 // metadata, which should be distinct for each loop.
3800 ConstantAsMetadata *BoolConst =
3801 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
3802 LoopMDList.push_back(MDNode::get(
3803 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
3804
3805 if (Simdlen || Safelen) {
3806 // If both simdlen and safelen clauses are specified, the value of the
3807 // simdlen parameter must be less than or equal to the value of the safelen
3808 // parameter. Therefore, use safelen only in the absence of simdlen.
3809 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
3810 LoopMDList.push_back(
3811 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
3812 ConstantAsMetadata::get(VectorizeWidth)}));
3813 }
3814
3815 addLoopMetadata(CanonicalLoop, LoopMDList);
3816}
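// For `#pragma omp simd simdlen(8)` with no if-clause and no finite safelen,
// the latch terminator ends up carrying loop metadata roughly equivalent to
// (illustrative):
//
//   br ... !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}
//
// with each memory access in the loop body annotated !llvm.access.group !4.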
3817
3818/// Create the TargetMachine object to query the backend for optimization
3819/// preferences.
3820///
3821/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
3822/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
3823/// needed for the LLVM pass pipeline. We use some default options to avoid
3824/// having to pass too many settings from the frontend that probably do not
3825/// matter.
3826///
3827/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
3828/// method. If we are going to use TargetMachine for more purposes, especially
3829/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
3830/// might be worth requiring front-ends to pass on their TargetMachine,
3831/// or at least cache it between methods. Note that while frontends such as Clang
3832/// have just a single main TargetMachine per translation unit, "target-cpu" and
3833/// "target-features" that determine the TargetMachine are per-function and can
3834/// be overridden using __attribute__((target("OPTIONS"))).
3835static std::unique_ptr<TargetMachine>
3836createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
3837 Module *M = F->getParent();
3838
3839 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
3840 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
3841 const std::string &Triple = M->getTargetTriple();
3842
3843 std::string Error;
3844 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
3845 if (!TheTarget)
3846 return {};
3847
3848 llvm::TargetOptions Options;
3849 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
3850 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
3851 /*CodeModel=*/std::nullopt, OptLevel));
3852}
3853
3854/// Heuristically determine the best-performant unroll factor for \p CLI. This
3855/// depends on the target processor. We are re-using the same heuristics as the
3856/// LoopUnrollPass.
3857static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
3858 Function *F = CLI->getFunction();
3859
3860 // Assume the user requests the most aggressive unrolling, even if the rest of
3861 // the code is optimized using a lower setting.
3862 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
3863 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
3864
3866 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
3867 FAM.registerPass([]() { return AssumptionAnalysis(); });
3868 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3869 FAM.registerPass([]() { return LoopAnalysis(); });
3870 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
3871 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3872 TargetIRAnalysis TIRA;
3873 if (TM)
3874 TIRA = TargetIRAnalysis(
3875 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
3876 FAM.registerPass([&]() { return TIRA; });
3877
3878 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
3879 ScalarEvolutionAnalysis SEA;
3880 ScalarEvolution &&SE = SEA.run(*F, FAM);
3881 DominatorTreeAnalysis DTA;
3882 DominatorTree &&DT = DTA.run(*F, FAM);
3883 LoopAnalysis LIA;
3884 LoopInfo &&LI = LIA.run(*F, FAM);
3885 AssumptionAnalysis ACT;
3886 AssumptionCache &&AC = ACT.run(*F, FAM);
3887 OptimizationRemarkEmitter ORE{F};
3888
3889 Loop *L = LI.getLoopFor(CLI->getHeader());
3890 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
3891
3892 TargetTransformInfo::UnrollingPreferences UP =
3893 gatherUnrollingPreferences(L, SE, TTI,
3894 /*BlockFrequencyInfo=*/nullptr,
3895 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
3896 /*UserThreshold=*/std::nullopt,
3897 /*UserCount=*/std::nullopt,
3898 /*UserAllowPartial=*/true,
3899 /*UserAllowRuntime=*/true,
3900 /*UserUpperBound=*/std::nullopt,
3901 /*UserFullUnrollMaxCount=*/std::nullopt);
3902
3903 UP.Force = true;
3904
3905 // Account for additional optimizations taking place before the LoopUnrollPass
3906 // would unroll the loop.
3907 UP.Threshold *= UnrollThresholdFactor;
3908 UP.PartialThreshold *= UnrollThresholdFactor;
3909
3910 // Use normal unroll factors even if the rest of the code is optimized for
3911 // size.
3912 UP.OptSizeThreshold = UP.Threshold;
3913 UP.PartialOptSizeThreshold = UP.PartialThreshold;
3914
3915 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
3916 << " Threshold=" << UP.Threshold << "\n"
3917 << " PartialThreshold=" << UP.PartialThreshold << "\n"
3918 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
3919 << " PartialOptSizeThreshold="
3920 << UP.PartialOptSizeThreshold << "\n");
3921
3922 // Disable peeling.
3923 TargetTransformInfo::PeelingPreferences PP =
3924 gatherPeelingPreferences(L, SE, TTI,
3925 /*UserAllowPeeling=*/false,
3926 /*UserAllowProfileBasedPeeling=*/false,
3927 /*UnrollingSpecficValues=*/false);
3928
3929 SmallPtrSet<const Value *, 32> EphValues;
3930 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
3931
3932 // Assume that reads and writes to stack variables can be eliminated by
3933 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
3934 // size.
3935 for (BasicBlock *BB : L->blocks()) {
3936 for (Instruction &I : *BB) {
3937 Value *Ptr;
3938 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3939 Ptr = Load->getPointerOperand();
3940 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3941 Ptr = Store->getPointerOperand();
3942 } else
3943 continue;
3944
3945 Ptr = Ptr->stripPointerCasts();
3946
3947 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
3948 if (Alloca->getParent() == &F->getEntryBlock())
3949 EphValues.insert(&I);
3950 }
3951 }
3952 }
3953
3954 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
3955
3956 // Loop is not unrollable if the loop contains certain instructions.
3957 if (!UCE.canUnroll() || UCE.Convergent) {
3958 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
3959 return 1;
3960 }
3961
3962 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
3963 << "\n");
3964
3965 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
3966 // be able to use it.
3967 int TripCount = 0;
3968 int MaxTripCount = 0;
3969 bool MaxOrZero = false;
3970 unsigned TripMultiple = 0;
3971
3972 bool UseUpperBound = false;
3973 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
3974 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
3975 UseUpperBound);
3976 unsigned Factor = UP.Count;
3977 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
3978
3979 // This function returns 1 to signal to not unroll a loop.
3980 if (Factor == 0)
3981 return 1;
3982 return Factor;
3983}
3984
3985void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
3986 int32_t Factor,
3987 CanonicalLoopInfo **UnrolledCLI) {
3988 assert(Factor >= 0 && "Unroll factor must not be negative");
3989
3990 Function *F = Loop->getFunction();
3991 LLVMContext &Ctx = F->getContext();
3992
3993 // If the unrolled loop is not used for another loop-associated directive, it
3994 // is sufficient to add metadata for the LoopUnrollPass.
3995 if (!UnrolledCLI) {
3996 SmallVector<Metadata *, 2> LoopMetadata;
3997 LoopMetadata.push_back(
3998 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
3999
4000 if (Factor >= 1) {
4001 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
4002 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
4003 LoopMetadata.push_back(MDNode::get(
4004 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
4005 }
4006
4007 addLoopMetadata(Loop, LoopMetadata);
4008 return;
4009 }
4010
4011 // Heuristically determine the unroll factor.
4012 if (Factor == 0)
4013 Factor = computeHeuristicUnrollFactor(Loop);
4014
4015 // No change required with unroll factor 1.
4016 if (Factor == 1) {
4017 *UnrolledCLI = Loop;
4018 return;
4019 }
4020
4021 assert(Factor >= 2 &&
4022 "unrolling only makes sense with a factor of 2 or larger");
4023
4024 Type *IndVarTy = Loop->getIndVarType();
4025
4026 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
4027 // unroll the inner loop.
4028 Value *FactorVal =
4029 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
4030 /*isSigned=*/false));
4031 std::vector<CanonicalLoopInfo *> LoopNest =
4032 tileLoops(DL, {Loop}, {FactorVal});
4033 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
4034 *UnrolledCLI = LoopNest[0];
4035 CanonicalLoopInfo *InnerLoop = LoopNest[1];
4036
4037 // LoopUnrollPass can only fully unroll loops with constant trip count.
4038 // Unroll by the unroll factor with a fallback epilog for the remainder
4039 // iterations if necessary.
4040 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
4041 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
4042 addLoopMetadata(
4043 InnerLoop,
4044 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
4045 MDNode::get(
4046 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
4047
4048#ifndef NDEBUG
4049 (*UnrolledCLI)->assertOK();
4050#endif
4051}
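// Example: a partial unroll by 4 whose result feeds another loop-associated
// directive (UnrolledCLI != nullptr) is realized as tileLoops with tile size
// 4; the outer floor loop is returned through *UnrolledCLI while the inner
// tile loop is tagged with "llvm.loop.unroll.count" 4 so the LoopUnrollPass
// later flattens it, with an epilogue covering any remainder iterations.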
4052
4053OpenMPIRBuilder::InsertPointTy
4054OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
4055 llvm::Value *BufSize, llvm::Value *CpyBuf,
4056 llvm::Value *CpyFn, llvm::Value *DidIt) {
4057 if (!updateToLocation(Loc))
4058 return Loc.IP;
4059
4060 uint32_t SrcLocStrSize;
4061 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4062 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4063 Value *ThreadId = getOrCreateThreadID(Ident);
4064
4065 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
4066
4067 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
4068
4069 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
4070 Builder.CreateCall(Fn, Args);
4071
4072 return Builder.saveIP();
4073}
4074
4075OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
4076 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4077 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
4078 ArrayRef<llvm::Function *> CPFuncs) {
4079
4080 if (!updateToLocation(Loc))
4081 return Loc.IP;
4082
4083 // If needed allocate and initialize `DidIt` with 0.
4084 // DidIt: flag variable: 1=single thread; 0=not single thread.
4085 llvm::Value *DidIt = nullptr;
4086 if (!CPVars.empty()) {
4087 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
4088 Builder.CreateStore(Builder.getInt32(0), DidIt);
4089 }
4090
4091 Directive OMPD = Directive::OMPD_single;
4092 uint32_t SrcLocStrSize;
4093 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4094 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4095 Value *ThreadId = getOrCreateThreadID(Ident);
4096 Value *Args[] = {Ident, ThreadId};
4097
4098 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
4099 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4100
4101 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
4102 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4103
4104 auto FiniCBWrapper = [&](InsertPointTy IP) {
4105 FiniCB(IP);
4106
4107 // The thread that executes the single region must set `DidIt` to 1.
4108 // This is used by __kmpc_copyprivate, to know if the caller is the
4109 // single thread or not.
4110 if (DidIt)
4111 Builder.CreateStore(Builder.getInt32(1), DidIt);
4112 };
4113
4114 // generates the following:
4115 // if (__kmpc_single()) {
4116 // .... single region ...
4117 // __kmpc_end_single
4118 // }
4119 // __kmpc_copyprivate
4120 // __kmpc_barrier
4121
4122 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
4123 /*Conditional*/ true,
4124 /*hasFinalize*/ true);
4125
4126 if (DidIt) {
4127 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
4128 // NOTE BufSize is currently unused, so just pass 0.
4129 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
4130 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
4131 CPFuncs[I], DidIt);
4132 // NOTE __kmpc_copyprivate already inserts a barrier
4133 } else if (!IsNowait)
4134 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
4135 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
4136 /* CheckCancelFlag */ false);
4137 return Builder.saveIP();
4138}
4139
4140OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
4141 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4142 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
4143
4144 if (!updateToLocation(Loc))
4145 return Loc.IP;
4146
4147 Directive OMPD = Directive::OMPD_critical;
4148 uint32_t SrcLocStrSize;
4149 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4150 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4151 Value *ThreadId = getOrCreateThreadID(Ident);
4152 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
4153 Value *Args[] = {Ident, ThreadId, LockVar};
4154
4155 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
4156 Function *RTFn = nullptr;
4157 if (HintInst) {
4158 // Add Hint to entry Args and create call
4159 EnterArgs.push_back(HintInst);
4160 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
4161 } else {
4162 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
4163 }
4164 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
4165
4166 Function *ExitRTLFn =
4167 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
4168 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4169
4170 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4171 /*Conditional*/ false, /*hasFinalize*/ true);
4172}
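// The emitted sequence for `#pragma omp critical (name)` is, schematically:
//
//   __kmpc_critical(&loc, tid, &lock);      // or __kmpc_critical_with_hint
//   ... region body ...
//   __kmpc_end_critical(&loc, tid, &lock);
//
// where `lock` is the per-name global obtained from getOMPCriticalRegionLock
// (a sketch of the call protocol, not verbatim IR).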
4173
4174OpenMPIRBuilder::InsertPointTy
4175OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
4176 InsertPointTy AllocaIP, unsigned NumLoops,
4177 ArrayRef<llvm::Value *> StoreValues,
4178 const Twine &Name, bool IsDependSource) {
4179 assert(
4180 llvm::all_of(StoreValues,
4181 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
4182 "OpenMP runtime requires depend vec with i64 type");
4183
4184 if (!updateToLocation(Loc))
4185 return Loc.IP;
4186
4187 // Allocate space for vector and generate alloc instruction.
4188 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
4189 Builder.restoreIP(AllocaIP);
4190 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
4191 ArgsBase->setAlignment(Align(8));
4192 Builder.restoreIP(Loc.IP);
4193
4194 // Store the index value with offset in depend vector.
4195 for (unsigned I = 0; I < NumLoops; ++I) {
4196 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
4197 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
4198 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
4199 STInst->setAlignment(Align(8));
4200 }
4201
4202 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
4203 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
4204
4205 uint32_t SrcLocStrSize;
4206 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4207 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4208 Value *ThreadId = getOrCreateThreadID(Ident);
4209 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
4210
4211 Function *RTLFn = nullptr;
4212 if (IsDependSource)
4213 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
4214 else
4215 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
4216 Builder.CreateCall(RTLFn, Args);
4217
4218 return Builder.saveIP();
4219}
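// For an `ordered(2)` doacross loop nest, a depend(source:) construct stores
// the two loop IVs and posts them; conceptually (runtime pseudo-code, not the
// emitted IR):
//
//   kmp_int64 vec[2] = {iv0, iv1};
//   __kmpc_doacross_post(&loc, tid, &vec[0]);
//
// while depend(sink: ...) packs the sink iteration vector the same way and
// calls __kmpc_doacross_wait instead (IsDependSource == false above).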
4220
4221OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
4222 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4223 FinalizeCallbackTy FiniCB, bool IsThreads) {
4224 if (!updateToLocation(Loc))
4225 return Loc.IP;
4226
4227 Directive OMPD = Directive::OMPD_ordered;
4228 Instruction *EntryCall = nullptr;
4229 Instruction *ExitCall = nullptr;
4230
4231 if (IsThreads) {
4232 uint32_t SrcLocStrSize;
4233 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4234 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4235 Value *ThreadId = getOrCreateThreadID(Ident);
4236 Value *Args[] = {Ident, ThreadId};
4237
4238 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
4239 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4240
4241 Function *ExitRTLFn =
4242 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
4243 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4244 }
4245
4246 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4247 /*Conditional*/ false, /*hasFinalize*/ true);
4248}
4249
4250OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
4251 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
4252 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
4253 bool HasFinalize, bool IsCancellable) {
4254
4255 if (HasFinalize)
4256 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
4257
4258 // Create inlined region's entry and body blocks, in preparation
4259 // for conditional creation
4260 BasicBlock *EntryBB = Builder.GetInsertBlock();
4261 Instruction *SplitPos = EntryBB->getTerminator();
4262 if (!isa_and_nonnull<BranchInst>(SplitPos))
4263 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
4264 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
4265 BasicBlock *FiniBB =
4266 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
4267 Builder.SetInsertPoint(EntryBB->getTerminator());
4268
4269 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
4270
4271 // generate body
4272 BodyGenCB(/* AllocaIP */ InsertPointTy(),
4273 /* CodeGenIP */ Builder.saveIP());
4274
4275 // emit exit call and do any needed finalization.
4276 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
4277 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
4278 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
4279 "Unexpected control flow graph state!!");
4280 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
4281 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
4282 "Unexpected Control Flow State!");
4283 MergeBlockIntoPredecessor(FiniBB);
4284
4285 // If we are skipping the region of a non-conditional, remove the exit
4286 // block, and clear the builder's insertion point.
4287 assert(SplitPos->getParent() == ExitBB &&
4288 "Unexpected Insertion point location!");
4289 auto merged = MergeBlockIntoPredecessor(ExitBB);
4290 BasicBlock *ExitPredBB = SplitPos->getParent();
4291 auto InsertBB = merged ? ExitPredBB : ExitBB;
4292 if (!isa_and_nonnull<BranchInst>(SplitPos))
4293 SplitPos->eraseFromParent();
4294 Builder.SetInsertPoint(InsertBB);
4295
4296 return Builder.saveIP();
4297}
4298
4299OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
4300 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
4301 // If there is nothing to do, return the current insertion point.
4302 if (!Conditional || !EntryCall)
4303 return Builder.saveIP();
4304
4305 BasicBlock *EntryBB = Builder.GetInsertBlock();
4306 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
4307 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
4308 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
4309
4310 // Emit thenBB and set the Builder's insertion point there for
4311 // body generation next. Place the block after the current block.
4312 Function *CurFn = EntryBB->getParent();
4313 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
4314
4315 // Move Entry branch to end of ThenBB, and replace with conditional
4316 // branch (If-stmt)
4317 Instruction *EntryBBTI = EntryBB->getTerminator();
4318 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
4319 EntryBBTI->removeFromParent();
4320 Builder.SetInsertPoint(UI);
4321 Builder.Insert(EntryBBTI);
4322 UI->eraseFromParent();
4323 Builder.SetInsertPoint(ThenBB->getTerminator());
4324
4325 // return an insertion point to ExitBB.
4326 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
4327}
4328
4329OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
4330 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
4331 bool HasFinalize) {
4332
4333 Builder.restoreIP(FinIP);
4334
4335 // If there is finalization to do, emit it before the exit call
4336 if (HasFinalize) {
4337 assert(!FinalizationStack.empty() &&
4338 "Unexpected finalization stack state!");
4339
4340 FinalizationInfo Fi = FinalizationStack.pop_back_val();
4341 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
4342
4343 Fi.FiniCB(FinIP);
4344
4345 BasicBlock *FiniBB = FinIP.getBlock();
4346 Instruction *FiniBBTI = FiniBB->getTerminator();
4347
4348 // set Builder IP for call creation
4349 Builder.SetInsertPoint(FiniBBTI);
4350 }
4351
4352 if (!ExitCall)
4353 return Builder.saveIP();
4354
4355 // Place the exit call as the last instruction before the finalization block terminator.
4356 ExitCall->removeFromParent();
4357 Builder.Insert(ExitCall);
4358
4359 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
4360 ExitCall->getIterator());
4361}
4362
4363OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
4364 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
4365 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
4366 if (!IP.isSet())
4367 return IP;
4368
4369 IRBuilder<>::InsertPointGuard IPG(Builder);
4370
4371 // creates the following CFG structure
4372 // OMP_Entry : (MasterAddr != PrivateAddr)?
4373 // F T
4374 // | \
4375 // | copyin.not.master
4376 // | /
4377 // v /
4378 // copyin.not.master.end
4379 // |
4380 // v
4381 // OMP.Entry.Next
4382
4383 BasicBlock *OMP_Entry = IP.getBlock();
4384 Function *CurFn = OMP_Entry->getParent();
4385 BasicBlock *CopyBegin =
4386 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
4387 BasicBlock *CopyEnd = nullptr;
4388
4389 // If entry block is terminated, split to preserve the branch to following
4390 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
4391 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
4392 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
4393 "copyin.not.master.end");
4394 OMP_Entry->getTerminator()->eraseFromParent();
4395 } else {
4396 CopyEnd =
4397 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
4398 }
4399
4400 Builder.SetInsertPoint(OMP_Entry);
4401 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
4402 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
4403 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
4404 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
4405
4406 Builder.SetInsertPoint(CopyBegin);
4407 if (BranchtoEnd)
4408 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
4409
4410 return Builder.saveIP();
4411}
4412
4413CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
4414 Value *Size, Value *Allocator,
4415 std::string Name) {
4416 IRBuilder<>::InsertPointGuard IPG(Builder);
4417 updateToLocation(Loc);
4418
4418
4419 uint32_t SrcLocStrSize;
4420 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4421 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4422 Value *ThreadId = getOrCreateThreadID(Ident);
4423 Value *Args[] = {ThreadId, Size, Allocator};
4424
4425 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
4426
4427 return Builder.CreateCall(Fn, Args, Name);
4428}
4429
4430CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
4431 Value *Addr, Value *Allocator,
4432 std::string Name) {
4433 IRBuilder<>::InsertPointGuard IPG(Builder);
4434 updateToLocation(Loc);
4435
4435
4436 uint32_t SrcLocStrSize;
4437 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4438 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4439 Value *ThreadId = getOrCreateThreadID(Ident);
4440 Value *Args[] = {ThreadId, Addr, Allocator};
4441 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
4442 return Builder.CreateCall(Fn, Args, Name);
4443}
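// Minimal usage sketch for the two allocator entry points above, assuming an
// OpenMPIRBuilder OMPB, a LocationDescription Loc, and an allocator handle
// already materialized as an llvm::Value:
//
//   Value *Size = OMPB.Builder.getInt64(1024);
//   CallInst *Ptr = OMPB.createOMPAlloc(Loc, Size, Allocator, "buf");
//   // ... use Ptr ...
//   OMPB.createOMPFree(Loc, Ptr, Allocator, "");
//
// Both lower directly to __kmpc_alloc/__kmpc_free with the current thread ID.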
4444
4445CallInst *OpenMPIRBuilder::createOMPInteropInit(
4446 const LocationDescription &Loc, Value *InteropVar,
4447 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
4448 Value *DependenceAddress, bool HaveNowaitClause) {
4449 IRBuilder<>::InsertPointGuard IPG(Builder);
4450 updateToLocation(Loc);
4451
4452 uint32_t SrcLocStrSize;
4453 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4454 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4455 Value *ThreadId = getOrCreateThreadID(Ident);
4456 if (Device == nullptr)
4457 Device = ConstantInt::get(Int32, -1);
4458 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
4459 if (NumDependences == nullptr) {
4460 NumDependences = ConstantInt::get(Int32, 0);
4461 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4462 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4463 }
4464 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4465 Value *Args[] = {
4466 Ident, ThreadId, InteropVar, InteropTypeVal,
4467 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
4468
4469 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
4470
4471 return Builder.CreateCall(Fn, Args);
4472}
4473
4474CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
4475 const LocationDescription &Loc, Value *InteropVar, Value *Device,
4476 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
4477 IRBuilder<>::InsertPointGuard IPG(Builder);
4478 updateToLocation(Loc);
4479
4479
4480 uint32_t SrcLocStrSize;
4481 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4482 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4483 Value *ThreadId = getOrCreateThreadID(Ident);
4484 if (Device == nullptr)
4485 Device = ConstantInt::get(Int32, -1);
4486 if (NumDependences == nullptr) {
4487 NumDependences = ConstantInt::get(Int32, 0);
4488 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4489 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4490 }
4491 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4492 Value *Args[] = {
4493 Ident, ThreadId, InteropVar, Device,
4494 NumDependences, DependenceAddress, HaveNowaitClauseVal};
4495
4496 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
4497
4498 return Builder.CreateCall(Fn, Args);
4499}
4500
4501CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
4502 Value *InteropVar, Value *Device,
4503 Value *NumDependences,
4504 Value *DependenceAddress,
4505 bool HaveNowaitClause) {
4506 IRBuilder<>::InsertPointGuard IPG(Builder);
4507 updateToLocation(Loc);
4508 uint32_t SrcLocStrSize;
4509 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4510 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4511 Value *ThreadId = getOrCreateThreadID(Ident);
4512 if (Device == nullptr)
4513 Device = ConstantInt::get(Int32, -1);
4514 if (NumDependences == nullptr) {
4515 NumDependences = ConstantInt::get(Int32, 0);
4516 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4517 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4518 }
4519 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4520 Value *Args[] = {
4521 Ident, ThreadId, InteropVar, Device,
4522 NumDependences, DependenceAddress, HaveNowaitClauseVal};
4523
4524 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
4525
4526 return Builder.CreateCall(Fn, Args);
4527}
4528
4529CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
4530 const LocationDescription &Loc, llvm::Value *Pointer,
4531 llvm::ConstantInt *Size, const llvm::Twine &Name) {
4532 IRBuilder<>::InsertPointGuard IPG(Builder);
4533 updateToLocation(Loc);
4534
4535 uint32_t SrcLocStrSize;
4536 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4537 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4538 Value *ThreadId = getOrCreateThreadID(Ident);
4539 Constant *ThreadPrivateCache =
4540 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
4541 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
4542
4543 Function *Fn =
4544 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
4545
4546 return Builder.CreateCall(Fn, Args);
4547}
4548
4549OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
4550 const LocationDescription &Loc, bool IsSPMD,
4551 int32_t MinThreadsVal, int32_t MaxThreadsVal,
4552 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
4553 if (!updateToLocation(Loc))
4554 return Loc.IP;
4555
4556 uint32_t SrcLocStrSize;
4557 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4558 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4559 Constant *IsSPMDVal = ConstantInt::getSigned(
4560 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
4561 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
4562 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
4563 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
4564
4565 Function *Kernel = Builder.GetInsertBlock()->getParent();
4566
4567 // Manifest the launch configuration in the metadata matching the kernel
4568 // environment.
4569 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
4570 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
4571
4572 // For max values, < 0 means unset, == 0 means set but unknown.
4573 if (MaxThreadsVal < 0)
4574 MaxThreadsVal = std::max(
4575 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
4576
4577 if (MaxThreadsVal > 0)
4578 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
4579
4580 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
4581 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
4582 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
4583 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
4584 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
4585 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
4586
4587 // We need to strip the debug prefix to get the correct kernel name.
4588 StringRef KernelName = Kernel->getName();
4589 const std::string DebugPrefix = "_debug__";
4590 if (KernelName.ends_with(DebugPrefix))
4591 KernelName = KernelName.drop_back(DebugPrefix.length());
4592
4593 Function *Fn = getOrCreateRuntimeFunctionPtr(
4594 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
4595 const DataLayout &DL = Fn->getParent()->getDataLayout();
4596
4597 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
4598 Constant *DynamicEnvironmentInitializer =
4599 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
4600 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
4601 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
4602 DynamicEnvironmentInitializer, DynamicEnvironmentName,
4603 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4604 DL.getDefaultGlobalsAddressSpace());
4605 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4606
4607 Constant *DynamicEnvironment =
4608 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
4609 ? DynamicEnvironmentGV
4610 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
4611 DynamicEnvironmentPtr);
4612
4613 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
4614 ConfigurationEnvironment, {
4615 UseGenericStateMachineVal,
4616 MayUseNestedParallelismVal,
4617 IsSPMDVal,
4618 MinThreads,
4619 MaxThreads,
4620 MinTeams,
4621 MaxTeams,
4622 ReductionDataSize,
4623 ReductionBufferLength,
4624 });
4625 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
4626 KernelEnvironment, {
4627 ConfigurationEnvironmentInitializer,
4628 Ident,
4629 DynamicEnvironment,
4630 });
4631 Twine KernelEnvironmentName = KernelName + "_kernel_environment";
4632 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
4633 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
4634 KernelEnvironmentInitializer, KernelEnvironmentName,
4635 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4636 DL.getDefaultGlobalsAddressSpace());
4637 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4638
4639 Constant *KernelEnvironment =
4640 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
4641 ? KernelEnvironmentGV
4642 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
4643 KernelEnvironmentPtr);
4644 Value *KernelLaunchEnvironment = Kernel->getArg(0);
4645 CallInst *ThreadKind =
4646 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
4647
4648 Value *ExecUserCode = Builder.CreateICmpEQ(
4649 ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
4650 "exec_user_code");
4651
4652 // ThreadKind = __kmpc_target_init(...)
4653 // if (ThreadKind == -1)
4654 // user_code
4655 // else
4656 // return;
4657
4658 auto *UI = Builder.CreateUnreachable();
4659 BasicBlock *CheckBB = UI->getParent();
4660 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
4661
4662 BasicBlock *WorkerExitBB = BasicBlock::Create(
4663 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
4664 Builder.SetInsertPoint(WorkerExitBB);
4665 Builder.CreateRetVoid();
4666
4667 auto *CheckBBTI = CheckBB->getTerminator();
4668 Builder.SetInsertPoint(CheckBBTI);
4669 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
4670
4671 CheckBBTI->eraseFromParent();
4672 UI->eraseFromParent();
4673
4674 // Continue in the "user_code" block, see diagram above and in
4675 // openmp/libomptarget/deviceRTLs/common/include/target.h .
4676 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
4677}
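// As an illustration (editor's sketch, not verbatim compiler output; the
// struct type names are schematic): for a kernel "foo" the code above emits
// globals and an entry check roughly like
//
//   @foo_dynamic_environment = weak_odr protected global %DynamicEnvironmentTy ...
//   @foo_kernel_environment = weak_odr protected constant %KernelEnvironmentTy {
//       %ConfigurationEnvironmentTy { i8 %use_generic_sm, i8 1, i8 %exec_mode,
//           i32 %min_threads, i32 %max_threads, i32 %min_teams, i32 %max_teams,
//           i32 0, i32 0 }, ptr %ident, ptr @foo_dynamic_environment }
//
//   %thread_kind = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn)
//   %exec_user_code = icmp eq i32 %thread_kind, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit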
4678
4679void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
4680 int32_t TeamsReductionDataSize,
4681 int32_t TeamsReductionBufferLength) {
4682 if (!updateToLocation(Loc))
4683 return;
4684
4685 Function *Fn = getOrCreateRuntimeFunctionPtr(
4686 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
4687
4688 Builder.CreateCall(Fn, {});
4689
4690 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
4691 return;
4692
4693 Function *Kernel = Builder.GetInsertBlock()->getParent();
4694 // We need to strip the debug prefix to get the correct kernel name.
4695 StringRef KernelName = Kernel->getName();
4696 const std::string DebugPrefix = "_debug__";
4697 if (KernelName.ends_with(DebugPrefix))
4698 KernelName = KernelName.drop_back(DebugPrefix.length());
4699 auto *KernelEnvironmentGV =
4700 M.getNamedGlobal((KernelName + "_kernel_environment").str());
4701 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
4702 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
4703 auto *NewInitializer = ConstantFoldInsertValueInstruction(
4704 KernelEnvironmentInitializer,
4705 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
4706 NewInitializer = ConstantFoldInsertValueInstruction(
4707 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
4708 {0, 8});
4709 KernelEnvironmentGV->setInitializer(NewInitializer);
4710}
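// Note on the two insertvalue folds above: index {0, 7} addresses the
// ReductionDataSize field and {0, 8} the ReductionBufferLength field of the
// configuration environment, which is member 0 of the kernel environment
// initializer built in createTargetInit (field order there:
// UseGenericStateMachine, MayUseNestedParallelism, ExecMode, MinThreads,
// MaxThreads, MinTeams, MaxTeams, ReductionDataSize, ReductionBufferLength).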
4711
4712static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
4713 Module &M = *Kernel.getParent();
4714 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4715 for (auto *Op : MD->operands()) {
4716 if (Op->getNumOperands() != 3)
4717 continue;
4718 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
4719 if (!KernelOp || KernelOp->getValue() != &Kernel)
4720 continue;
4721 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
4722 if (!Prop || Prop->getString() != Name)
4723 continue;
4724 return Op;
4725 }
4726 return nullptr;
4727}
4728
4729static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
4730 bool Min) {
4731 // Update the "maxntidx" metadata for NVIDIA, or add it.
4732 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
4733 if (ExistingOp) {
4734 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4735 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4736 ExistingOp->replaceOperandWith(
4737 2, ConstantAsMetadata::get(ConstantInt::get(
4738 OldVal->getValue()->getType(),
4739 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
4740 } else {
4741 LLVMContext &Ctx = Kernel.getContext();
4742 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
4743 MDString::get(Ctx, Name),
4744 ConstantAsMetadata::get(
4745 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
4746 // Append metadata to nvvm.annotations
4747 Module &M = *Kernel.getParent();
4748 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4749 MD->addOperand(MDNode::get(Ctx, MDVals));
4750 }
4751}
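// Illustrative module-level result (schematic): after a first call for a
// kernel @k with Name="maxntidx" and Value=128, the module contains
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @k, !"maxntidx", i32 128}
//
// and a later call with Value=96 and Min=true rewrites the i32 operand to 96.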
4752
4753std::pair<int32_t, int32_t>
4754OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
4755 int32_t ThreadLimit =
4756 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
4757
4758 if (T.isAMDGPU()) {
4759 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
4760 if (!Attr.isValid() || !Attr.isStringAttribute())
4761 return {0, ThreadLimit};
4762 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
4763 int32_t LB, UB;
4764 if (!llvm::to_integer(UBStr, UB, 10))
4765 return {0, ThreadLimit};
4766 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
4767 if (!llvm::to_integer(LBStr, LB, 10))
4768 return {0, UB};
4769 return {LB, UB};
4770 }
4771
4772 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
4773 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4774 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4775 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
4776 }
4777 return {0, ThreadLimit};
4778}
4779
4780void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
4781 Function &Kernel, int32_t LB,
4782 int32_t UB) {
4783 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
4784
4785 if (T.isAMDGPU()) {
4786 Kernel.addFnAttr("amdgpu-flat-work-group-size",
4787 llvm::utostr(LB) + "," + llvm::utostr(UB));
4788 return;
4789 }
4790
4791 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
4792}
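// For example (illustrative): writeThreadBoundsForKernel(T, K, 1, 256) adds
// "omp_target_thread_limit"="256" and, on AMDGPU, the function attribute
// "amdgpu-flat-work-group-size"="1,256"; on other targets it updates the
// NVPTX "maxntidx" annotation shown above instead.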
4793
4794std::pair<int32_t, int32_t>
4795OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
4796 // TODO: Read from backend annotations if available.
4797 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
4798}
4799
4800void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
4801 int32_t LB, int32_t UB) {
4802 if (T.isNVPTX())
4803 if (UB > 0)
4804 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
4805 if (T.isAMDGPU())
4806 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
4807
4808 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
4809}
4810
4811void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
4812 Function *OutlinedFn) {
4813 if (Config.isTargetDevice()) {
4814 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
4815 // TODO: Determine if DSO local can be set to true.
4816 OutlinedFn->setDSOLocal(false);
4817 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
4818 if (T.isAMDGCN())
4819 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
4820 }
4821}
4822
4823Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
4824 StringRef EntryFnIDName) {
4825 if (Config.isTargetDevice()) {
4826 assert(OutlinedFn && "The outlined function must exist if embedded");
4827 return OutlinedFn;
4828 }
4829
4830 return new GlobalVariable(
4831 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
4832 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
4833}
4834
4835Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
4836 StringRef EntryFnName) {
4837 if (OutlinedFn)
4838 return OutlinedFn;
4839
4840 assert(!M.getGlobalVariable(EntryFnName, true) &&
4841 "Named kernel already exists?");
4842 return new GlobalVariable(
4843 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
4844 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
4845}
4846
4847void OpenMPIRBuilder::emitTargetRegionFunction(
4848 TargetRegionEntryInfo &EntryInfo,
4849 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
4850 Function *&OutlinedFn, Constant *&OutlinedFnID) {
4851
4852 SmallString<64> EntryFnName;
4853 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
4854
4855 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
4856 ? GenerateFunctionCallback(EntryFnName)
4857 : nullptr;
4858
4859 // If this target outline function is not an offload entry, we don't need to
4860 // register it. This may be the case for a false if clause, or if there are
4861 // no OpenMP targets.
4862 if (!IsOffloadEntry)
4863 return;
4864
4865 std::string EntryFnIDName =
4866 Config.isTargetDevice()
4867 ? std::string(EntryFnName)
4868 : createPlatformSpecificName({EntryFnName, "region_id"});
4869
4870 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
4871 EntryFnName, EntryFnIDName);
4872}
4873
4874Constant *OpenMPIRBuilder::registerTargetRegionFunction(
4875 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
4876 StringRef EntryFnName, StringRef EntryFnIDName) {
4877 if (OutlinedFn)
4878 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
4879 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
4880 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
4881 OffloadInfoManager.registerTargetRegionEntryInfo(
4882 EntryInfo, EntryAddr, OutlinedFnID,
4883 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
4884 return OutlinedFnID;
4885}
4886
4887OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
4888 const LocationDescription &Loc, InsertPointTy AllocaIP,
4889 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
4890 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
4891 omp::RuntimeFunction *MapperFunc,
4892 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
4893 BodyGenCB,
4894 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
4895 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
4896 if (!updateToLocation(Loc))
4897 return InsertPointTy();
4898
4899 // Disable TargetData CodeGen on Device pass.
4900 if (Config.IsTargetDevice.value_or(false)) {
4901 if (BodyGenCB)
4902 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
4903 return Builder.saveIP();
4904 }
4905
4906 Builder.restoreIP(CodeGenIP);
4907 bool IsStandAlone = !BodyGenCB;
4908 MapInfosTy *MapInfo;
4909 // Generate the code for the opening of the data environment. Capture all the
4910 // arguments of the runtime call by reference because they are used in the
4911 // closing of the region.
4912 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4913 MapInfo = &GenMapInfoCB(Builder.saveIP());
4914 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
4915 /*IsNonContiguous=*/true, DeviceAddrCB,
4916 CustomMapperCB);
4917
4918 TargetDataRTArgs RTArgs;
4919 emitOffloadingArraysArgument(Builder, RTArgs, Info,
4920 !MapInfo->Names.empty());
4921
4922 // Emit the number of elements in the offloading arrays.
4923 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4924
4925 // Source location for the ident struct
4926 if (!SrcLocInfo) {
4927 uint32_t SrcLocStrSize;
4928 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4929 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4930 }
4931
4932 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
4933 PointerNum, RTArgs.BasePointersArray,
4934 RTArgs.PointersArray, RTArgs.SizesArray,
4935 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4936 RTArgs.MappersArray};
4937
4938 if (IsStandAlone) {
4939 assert(MapperFunc && "MapperFunc missing for standalone target data");
4940 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
4941 OffloadingArgs);
4942 } else {
4943 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
4944 omp::OMPRTL___tgt_target_data_begin_mapper);
4945
4946 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
4947
4948 for (auto DeviceMap : Info.DevicePtrInfoMap) {
4949 if (isa<AllocaInst>(DeviceMap.second.second)) {
4950 auto *LI =
4951 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
4952 Builder.CreateStore(LI, DeviceMap.second.second);
4953 }
4954 }
4955
4956 // If device pointer privatization is required, emit the body of the
4957 // region here. It will have to be duplicated: with and without
4958 // privatization.
4959 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
4960 }
4961 };
4962
4963 // If we need device pointer privatization, we need to emit the body of the
4964 // region with no privatization in the 'else' branch of the conditional.
4965 // Otherwise, we don't have to do anything.
4966 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4967 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
4968 };
4969
4970 // Generate code for the closing of the data region.
4971 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4972 TargetDataRTArgs RTArgs;
4973 emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
4974 /*ForEndCall=*/true);
4975
4976 // Emit the number of elements in the offloading arrays.
4977 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4978
4979 // Source location for the ident struct
4980 if (!SrcLocInfo) {
4981 uint32_t SrcLocStrSize;
4982 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4983 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4984 }
4985
4986 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
4987 PointerNum, RTArgs.BasePointersArray,
4988 RTArgs.PointersArray, RTArgs.SizesArray,
4989 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4990 RTArgs.MappersArray};
4991 Function *EndMapperFunc =
4992 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
4993
4994 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
4995 };
4996
4997 // We don't have to do anything to close the region if the if clause evaluates
4998 // to false.
4999 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
5000
5001 if (BodyGenCB) {
5002 if (IfCond) {
5003 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
5004 } else {
5005 BeginThenGen(AllocaIP, Builder.saveIP());
5006 }
5007
5008 // If we don't require privatization of device pointers, we emit the body in
5009 // between the runtime calls. This avoids duplicating the body code.
5010 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
5011
5012 if (IfCond) {
5013 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
5014 } else {
5015 EndThenGen(AllocaIP, Builder.saveIP());
5016 }
5017 } else {
5018 if (IfCond) {
5019 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
5020 } else {
5021 BeginThenGen(AllocaIP, Builder.saveIP());
5022 }
5023 }
5024
5025 return Builder.saveIP();
5026}
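// Schematic call sequence for a non-standalone 'target data' region without
// an if clause (editor's sketch, argument names abbreviated):
//
//   call void @__tgt_target_data_begin_mapper(ptr %ident, i64 %dev, i32 %n,
//       ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %types, ptr %names, ptr %mappers)
//   ; ... region body emitted via BodyGenCB ...
//   call void @__tgt_target_data_end_mapper(<same argument shapes>)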
5027
5028FunctionCallee
5029OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
5030 bool IsGPUDistribute) {
5031 assert((IVSize == 32 || IVSize == 64) &&
5032 "IV size is not compatible with the omp runtime");
5033 RuntimeFunction Name;
5034 if (IsGPUDistribute)
5035 Name = IVSize == 32
5036 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
5037 : omp::OMPRTL___kmpc_distribute_static_init_4u)
5038 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
5039 : omp::OMPRTL___kmpc_distribute_static_init_8u);
5040 else
5041 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
5042 : omp::OMPRTL___kmpc_for_static_init_4u)
5043 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
5044 : omp::OMPRTL___kmpc_for_static_init_8u);
5045
5046 return getOrCreateRuntimeFunction(M, Name);
5047}
5048
5049FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
5050 bool IVSigned) {
5051 assert((IVSize == 32 || IVSize == 64) &&
5052 "IV size is not compatible with the omp runtime");
5053 RuntimeFunction Name = IVSize == 32
5054 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
5055 : omp::OMPRTL___kmpc_dispatch_init_4u)
5056 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
5057 : omp::OMPRTL___kmpc_dispatch_init_8u);
5058
5059 return getOrCreateRuntimeFunction(M, Name);
5060}
5061
5062FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
5063 bool IVSigned) {
5064 assert((IVSize == 32 || IVSize == 64) &&
5065 "IV size is not compatible with the omp runtime");
5066 RuntimeFunction Name = IVSize == 32
5067 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
5068 : omp::OMPRTL___kmpc_dispatch_next_4u)
5069 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
5070 : omp::OMPRTL___kmpc_dispatch_next_8u);
5071
5072 return getOrCreateRuntimeFunction(M, Name);
5073}
5074
5075FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
5076 bool IVSigned) {
5077 assert((IVSize == 32 || IVSize == 64) &&
5078 "IV size is not compatible with the omp runtime");
5079 RuntimeFunction Name = IVSize == 32
5080 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
5081 : omp::OMPRTL___kmpc_dispatch_fini_4u)
5082 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
5083 : omp::OMPRTL___kmpc_dispatch_fini_8u);
5084
5085 return getOrCreateRuntimeFunction(M, Name);
5086}
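// The selectors above follow the runtime's naming scheme: the suffix encodes
// the induction variable width and signedness, e.g. __kmpc_dispatch_next_4
// (signed i32), __kmpc_dispatch_next_4u (unsigned i32), and
// __kmpc_dispatch_next_8 / __kmpc_dispatch_next_8u for the i64 variants.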
5087
5088static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
5089 Function *Func) {
5090 for (User *User : make_early_inc_range(ConstExpr->users())) {
5091 if (auto *Instr = dyn_cast<Instruction>(User)) {
5092 if (Instr->getFunction() == Func) {
5093 Instruction *ConstInst = ConstExpr->getAsInstruction();
5094 ConstInst->insertBefore(*Instr->getParent(), Instr->getIterator());
5095 Instr->replaceUsesOfWith(ConstExpr, ConstInst);
5096 }
5097 }
5098 }
5099}
5100
5101static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
5102 Function *Func) {
5103 for (User *User : make_early_inc_range(Input->users()))
5104 if (auto *Const = dyn_cast<Constant>(User))
5105 if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const))
5106 replaceConstatExprUsesInFuncWithInstr(ConstExpr, Func);
5107}
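// For example (illustrative): if an outlined function uses the constant
// expression 'getelementptr ([4 x i32], ptr @g, i64 0, i64 2)', the helpers
// above materialize it as an equivalent getelementptr instruction directly
// before each user inside that function, so replaceUsesOfWith can later
// redirect @g to the corresponding function argument copy.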
5108
5109static Function *createOutlinedFunction(
5110 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
5111 SmallVectorImpl<Value *> &Inputs,
5112 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5113 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5114 SmallVector<Type *> ParameterTypes;
5115 if (OMPBuilder.Config.isTargetDevice()) {
5116 // Add the "implicit" runtime argument we use to provide launch specific
5117 // information for target devices.
5118 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
5119 ParameterTypes.push_back(Int8PtrTy);
5120
5121 // All parameters to target devices are passed as pointers
5122 // or i64. This assumes 64-bit address spaces/pointers.
5123 for (auto &Arg : Inputs)
5124 ParameterTypes.push_back(Arg->getType()->isPointerTy()
5125 ? Arg->getType()
5126 : Type::getInt64Ty(Builder.getContext()));
5127 } else {
5128 for (auto &Arg : Inputs)
5129 ParameterTypes.push_back(Arg->getType());
5130 }
5131
5132 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
5133 /*isVarArg*/ false);
5134 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
5135 Builder.GetInsertBlock()->getModule());
5136
5137 // Save insert point.
5138 auto OldInsertPoint = Builder.saveIP();
5139
5140 // Generate the region into the function.
5141 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
5142 Builder.SetInsertPoint(EntryBB);
5143
5144 // Insert target init call in the device compilation pass.
5145 if (OMPBuilder.Config.isTargetDevice())
5146 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
5147
5148 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
5149
5150 // As we embed the user code in the middle of our target region after we
5151 // generate entry code, we must move what allocas we can into the entry
5152 // block to avoid possibly breaking optimisations for the device.
5153 if (OMPBuilder.Config.isTargetDevice())
5154 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
5155
5156 // Insert target deinit call in the device compilation pass.
5157 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
5158 if (OMPBuilder.Config.isTargetDevice())
5159 OMPBuilder.createTargetDeinit(Builder);
5160
5161 // Insert return instruction.
5162 Builder.CreateRetVoid();
5163
5164 // New Alloca IP at entry point of created device function.
5165 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
5166 auto AllocaIP = Builder.saveIP();
5167
5168 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
5169
5170 // Skip the artificial dyn_ptr on the device.
5171 const auto &ArgRange =
5172 OMPBuilder.Config.isTargetDevice()
5173 ? make_range(Func->arg_begin() + 1, Func->arg_end())
5174 : Func->args();
5175
5176 // Rewrite uses of input values to parameters.
5177 for (auto InArg : zip(Inputs, ArgRange)) {
5178 Value *Input = std::get<0>(InArg);
5179 Argument &Arg = std::get<1>(InArg);
5180 Value *InputCopy = nullptr;
5181
5182 Builder.restoreIP(
5183 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
5184
5185 // Things like GEPs can come in the form of Constants. Constants and
5186 // ConstantExprs do not know what they are contained in, so we must dig a
5187 // little to find an instruction before we can tell whether they are used
5188 // inside the function we are outlining. We also replace the original
5189 // constant expression with an equivalent instruction, because an
5190 // instruction allows easy modification in the following loop: the constant
5191 // (now an instruction) is owned by our target function, so
5192 // replaceUsesOfWith can be invoked on it (that cannot be done with
5193 // constants, it seems). A brand new instruction also lets us be cautious,
5194 // as the old expression might be used inside the function while also
5195 // existing and being used externally (unlikely by the nature of a
5196 // Constant, but still).
5197 replaceConstantValueUsesInFuncWithInstr(Input, Func);
5198
5199 // Collect all the instructions
5200 for (User *User : make_early_inc_range(Input->users()))
5201 if (auto *Instr = dyn_cast<Instruction>(User))
5202 if (Instr->getFunction() == Func)
5203 Instr->replaceUsesOfWith(Input, InputCopy);
5204 }
5205
5206 // Restore insert point.
5207 Builder.restoreIP(OldInsertPoint);
5208
5209 return Func;
5210}
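// The resulting device function has roughly this shape (editor's sketch;
// names are illustrative, and linkage/attributes are adjusted later during
// registration):
//
//   define internal void @__omp_offloading_<id>_foo_l42(ptr %dyn_ptr, i64 %a) {
//   entry:
//     %tk = call i32 @__kmpc_target_init(ptr @..._kernel_environment, ptr %dyn_ptr)
//     ; ... user code, with uses of the host values rewritten to %a ...
//     call void @__kmpc_target_deinit()
//     ret void
//   }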
5211
5212static void emitTargetOutlinedFunction(
5213 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5214 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
5215 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
5216 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5217 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5218
5219 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
5220 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
5221 &ArgAccessorFuncCB](StringRef EntryFnName) {
5222 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
5223 CBFunc, ArgAccessorFuncCB);
5224 };
5225
5226 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
5227 OutlinedFn, OutlinedFnID);
5228}
5229
5230static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5231 OpenMPIRBuilder::InsertPointTy AllocaIP,
5232 Function *OutlinedFn, Constant *OutlinedFnID,
5233 int32_t NumTeams, int32_t NumThreads,
5234 SmallVectorImpl<Value *> &Args,
5235 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
5236
5237 OpenMPIRBuilder::TargetDataInfo Info(
5238 /*RequiresDevicePointerInfo=*/false,
5239 /*SeparateBeginEndCalls=*/true);
5240
5241 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
5242 OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
5243 /*IsNonContiguous=*/true);
5244
5245 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
5246 OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
5247 !MapInfo.Names.empty());
5248
5249 // emitKernelLaunch
5250 auto &&EmitTargetCallFallbackCB =
5251 [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
5252 Builder.restoreIP(IP);
5253 Builder.CreateCall(OutlinedFn, Args);
5254 return Builder.saveIP();
5255 };
5256
5257 unsigned NumTargetItems = MapInfo.BasePointers.size();
5258 // TODO: Use correct device ID
5259 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
5260 Value *NumTeamsVal = Builder.getInt32(NumTeams);
5261 Value *NumThreadsVal = Builder.getInt32(NumThreads);
5262 uint32_t SrcLocStrSize;
5263 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
5264 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
5265 llvm::omp::IdentFlag(0), 0);
5266 // TODO: Use correct NumIterations
5267 Value *NumIterations = Builder.getInt64(0);
5268 // TODO: Use correct DynCGGroupMem
5269 Value *DynCGGroupMem = Builder.getInt32(0);
5270
5271 bool HasNoWait = false;
5272
5273 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
5274 NumTeamsVal, NumThreadsVal,
5275 DynCGGroupMem, HasNoWait);
5276
5277 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
5278 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
5279 DeviceID, RTLoc, AllocaIP));
5280}
5281
5282OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
5283 const LocationDescription &Loc, InsertPointTy AllocaIP,
5284 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
5285 int32_t NumThreads, SmallVectorImpl<Value *> &Args,
5286 GenMapInfoCallbackTy GenMapInfoCB,
5287 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
5288 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
5289 if (!updateToLocation(Loc))
5290 return InsertPointTy();
5291
5292 Builder.restoreIP(CodeGenIP);
5293
5294 Function *OutlinedFn;
5295 Constant *OutlinedFnID;
5296 emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
5297 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
5298 if (!Config.isTargetDevice())
5299 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
5300 NumThreads, Args, GenMapInfoCB);
5301
5302 return Builder.saveIP();
5303}
5304
5305std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
5306 StringRef FirstSeparator,
5307 StringRef Separator) {
5308 SmallString<128> Buffer;
5309 llvm::raw_svector_ostream OS(Buffer);
5310 StringRef Sep = FirstSeparator;
5311 for (StringRef Part : Parts) {
5312 OS << Sep << Part;
5313 Sep = Separator;
5314 }
5315 return OS.str().str();
5316}
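// E.g. (illustrative) getNameWithSeparators({"var", "x"}, ".", "$") yields
// ".var$x": the first separator is printed before the first part and the
// regular separator before every following part.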
5317
5318std::string
5319OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
5320 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
5321 Config.separator());
5322}
5323
5324GlobalVariable *
5325OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
5326 unsigned AddressSpace) {
5327 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
5328 if (Elem.second) {
5329 assert(Elem.second->getValueType() == Ty &&
5330 "OMP internal variable has different type than requested");
5331 } else {
5332 // TODO: investigate the appropriate linkage type used for the global
5333 // variable for possibly changing that to internal or private, or maybe
5334 // create different versions of the function for different OMP internal
5335 // variables.
5336 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
5337 ? GlobalValue::ExternalLinkage
5338 : GlobalValue::CommonLinkage;
5339 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
5340 Constant::getNullValue(Ty), Elem.first(),
5341 /*InsertBefore=*/nullptr,
5342 GlobalValue::NotThreadLocal, AddressSpace);
5343 const DataLayout &DL = M.getDataLayout();
5344 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
5345 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
5346 GV->setAlignment(std::max(TypeAlign, PtrAlign));
5347 Elem.second = GV;
5348 }
5349
5350 return Elem.second;
5351}
5352
5353Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
5354 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
5355 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
5356 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
5357}
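// For example, for '#pragma omp critical(foo)' this returns the module-wide
// internal variable ".gomp_critical_user_foo.var" of type KmpCriticalNameTy
// (the runtime's kmp_critical_name lock type), shared by every use of the
// same critical name in the module.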
5358
5359Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
5360 LLVMContext &Ctx = Builder.getContext();
5361 Value *Null =
5362 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
5363 Value *SizeGep =
5364 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
5365 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
5366 return SizePtrToInt;
5367}
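// This is the usual sizeof-via-GEP idiom: indexing one element past a null
// base and converting the resulting address to an integer yields the element
// size, schematically
//   %gep  = getelementptr <ty>, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64
// (here <ty> is BasePtr's own pointer type).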
5368GlobalVariable *
5369OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
5371 std::string VarName) {
5372 llvm::Constant *MaptypesArrayInit =
5373 llvm::ConstantDataArray::get(M.getContext(), Mappings);
5374 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
5375 M, MaptypesArrayInit->getType(),
5376 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
5377 VarName);
5378 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
5379 return MaptypesArrayGlobal;
5380}
5381
5382void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
5383 InsertPointTy AllocaIP,
5384 unsigned NumOperands,
5385 struct MapperAllocas &MapperAllocas) {
5386 if (!updateToLocation(Loc))
5387 return;
5388
5389 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5390 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5391 Builder.restoreIP(AllocaIP);
5392 AllocaInst *ArgsBase = Builder.CreateAlloca(
5393 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
5394 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
5395 ".offload_ptrs");
5396 AllocaInst *ArgSizes = Builder.CreateAlloca(
5397 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
5398 Builder.restoreIP(Loc.IP);
5399 MapperAllocas.ArgsBase = ArgsBase;
5400 MapperAllocas.Args = Args;
5401 MapperAllocas.ArgSizes = ArgSizes;
5402}
5403
5404void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
5405 Function *MapperFunc, Value *SrcLocInfo,
5406 Value *MaptypesArg, Value *MapnamesArg,
5407 struct MapperAllocas &MapperAllocas,
5408 int64_t DeviceID, unsigned NumOperands) {
5409 if (!updateToLocation(Loc))
5410 return;
5411
5412 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5413 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5414 Value *ArgsBaseGEP =
5415 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
5416 {Builder.getInt32(0), Builder.getInt32(0)});
5417 Value *ArgsGEP =
5418 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
5419 {Builder.getInt32(0), Builder.getInt32(0)});
5420 Value *ArgSizesGEP =
5421 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
5422 {Builder.getInt32(0), Builder.getInt32(0)});
5423 Value *NullPtr =
5424 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
5425 Builder.CreateCall(MapperFunc,
5426 {SrcLocInfo, Builder.getInt64(DeviceID),
5427 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
5428 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
5429}
5430
5431void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
5432 TargetDataRTArgs &RTArgs,
5433 TargetDataInfo &Info,
5434 bool EmitDebug,
5435 bool ForEndCall) {
5436 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
5437 "expected region end call to runtime only when end call is separate");
5438 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
5439 auto VoidPtrTy = UnqualPtrTy;
5440 auto VoidPtrPtrTy = UnqualPtrTy;
5441 auto Int64Ty = Type::getInt64Ty(M.getContext());
5442 auto Int64PtrTy = UnqualPtrTy;
5443
5444 if (!Info.NumberOfPtrs) {
5445 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5446 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5447 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
5448 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
5449 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5450 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5451 return;
5452 }
5453
5454 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
5455 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
5456 Info.RTArgs.BasePointersArray,
5457 /*Idx0=*/0, /*Idx1=*/0);
5458 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
5459 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
5460 /*Idx0=*/0,
5461 /*Idx1=*/0);
5462 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
5463 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5464 /*Idx0=*/0, /*Idx1=*/0);
5465 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
5466 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
5467 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
5468 : Info.RTArgs.MapTypesArray,
5469 /*Idx0=*/0,
5470 /*Idx1=*/0);
5471
5472 // Only emit the mapper information arrays if debug information is
5473 // requested.
5474 if (!EmitDebug)
5475 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5476 else
5477 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
5478 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
5479 /*Idx0=*/0,
5480 /*Idx1=*/0);
5481 // If there is no user-defined mapper, set the mapper array to nullptr to
5482 // avoid an unnecessary data privatization
5483 if (!Info.HasMapper)
5484 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5485 else
5486 RTArgs.MappersArray =
5487 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
5488}
5489
5490void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
5491 InsertPointTy CodeGenIP,
5492 MapInfosTy &CombinedInfo,
5493 TargetDataInfo &Info) {
5494 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
5495 CombinedInfo.NonContigInfo;
5496
5497 // Build an array of struct descriptor_dim and then assign it to
5498 // offload_args.
5499 //
5500 // struct descriptor_dim {
5501 // uint64_t offset;
5502 // uint64_t count;
5503 // uint64_t stride
5504 // };
5505 Type *Int64Ty = Builder.getInt64Ty();
5506 StructType *DimTy = StructType::create(
5507 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
5508 "struct.descriptor_dim");
5509
5510 enum { OffsetFD = 0, CountFD, StrideFD };
5511 // We need two index variables here since the size of "Dims" is the same as
5512 // the size of Components; however, the size of offset, count, and stride is
5513 // equal to the size of the base declaration that is non-contiguous.
5514 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
5515 // Skip emitting ir if dimension size is 1 since it cannot be
5516 // non-contiguous.
5517 if (NonContigInfo.Dims[I] == 1)
5518 continue;
5519 Builder.restoreIP(AllocaIP);
5520 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
5521 AllocaInst *DimsAddr =
5522 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
5523 Builder.restoreIP(CodeGenIP);
5524 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
5525 unsigned RevIdx = EE - II - 1;
5526 Value *DimsLVal = Builder.CreateInBoundsGEP(
5527 DimsAddr->getAllocatedType(), DimsAddr,
5528 {Builder.getInt64(0), Builder.getInt64(II)});
5529 // Offset
5530 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
5531 Builder.CreateAlignedStore(
5532 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
5533 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
5534 // Count
5535 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
5536 Builder.CreateAlignedStore(
5537 NonContigInfo.Counts[L][RevIdx], CountLVal,
5538 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5539 // Stride
5540 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
5541 Builder.CreateAlignedStore(
5542 NonContigInfo.Strides[L][RevIdx], StrideLVal,
5543 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5544 }
5545 // args[I] = &dims
5546 Builder.restoreIP(CodeGenIP);
5547 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
5548 DimsAddr, Builder.getPtrTy());
5549 Value *P = Builder.CreateConstInBoundsGEP2_32(
5550 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
5551 Info.RTArgs.PointersArray, 0, I);
5552 Builder.CreateAlignedStore(
5553 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
5554 ++L;
5555 }
5556}
5557
5558void OpenMPIRBuilder::emitOffloadingArrays(
5559 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
5560 TargetDataInfo &Info, bool IsNonContiguous,
5561 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
5562 function_ref<Value *(unsigned int)> CustomMapperCB) {
5563
5564 // Reset the array information.
5565 Info.clearArrayInfo();
5566 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
5567
5568 if (Info.NumberOfPtrs == 0)
5569 return;
5570
5571 Builder.restoreIP(AllocaIP);
5572 // Detect if we have any capture size requiring runtime evaluation of the
5573 // size so that a constant array could be eventually used.
5574 ArrayType *PointerArrayType =
5575 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
5576
5577 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
5578 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
5579
5580 Info.RTArgs.PointersArray = Builder.CreateAlloca(
5581 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
5582 AllocaInst *MappersArray = Builder.CreateAlloca(
5583 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
5584 Info.RTArgs.MappersArray = MappersArray;
5585
5586 // If we don't have any VLA types or other types that require runtime
5587 // evaluation, we can use a constant array for the map sizes, otherwise we
5588 // need to fill up the arrays as we do for the pointers.
5589 Type *Int64Ty = Builder.getInt64Ty();
5590 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
5591 ConstantInt::get(Int64Ty, 0));
5592 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
5593 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
5594 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
5595 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
5596 if (IsNonContiguous &&
5597 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5598 CombinedInfo.Types[I] &
5599 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
5600 ConstSizes[I] =
5601 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
5602 else
5603 ConstSizes[I] = CI;
5604 continue;
5605 }
5606 }
5607 RuntimeSizes.set(I);
5608 }
5609
5610 if (RuntimeSizes.all()) {
5611 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5612 Info.RTArgs.SizesArray = Builder.CreateAlloca(
5613 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5614 Builder.restoreIP(CodeGenIP);
5615 } else {
5616 auto *SizesArrayInit = ConstantArray::get(
5617 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
5618 std::string Name = createPlatformSpecificName({"offload_sizes"});
5619 auto *SizesArrayGbl =
5620 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
5621 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
5622 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
5623
5624 if (!RuntimeSizes.any()) {
5625 Info.RTArgs.SizesArray = SizesArrayGbl;
5626 } else {
5627 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5628 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
5629 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5630 AllocaInst *Buffer = Builder.CreateAlloca(
5631 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5632 Buffer->setAlignment(OffloadSizeAlign);
5633 Builder.restoreIP(CodeGenIP);
5634 Builder.CreateMemCpy(
5635 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
5636 SizesArrayGbl, OffloadSizeAlign,
5637 Builder.getIntN(
5638 IndexSize,
5639 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
5640
5641 Info.RTArgs.SizesArray = Buffer;
5642 }
5643 Builder.restoreIP(CodeGenIP);
5644 }
5645
5646 // The map types are always constant so we don't need to generate code to
5647 // fill arrays. Instead, we create an array constant.
5648 SmallVector<uint64_t, 4> Mapping;
5649 for (auto mapFlag : CombinedInfo.Types)
5650 Mapping.push_back(
5651 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5652 mapFlag));
5653 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
5654 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5655 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
5656
5657 // The information types are only built if provided.
5658 if (!CombinedInfo.Names.empty()) {
5659 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
5660 auto *MapNamesArrayGbl =
5661 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
5662 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
5663 } else {
5664 Info.RTArgs.MapNamesArray =
5665 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
5666 }
5667
5668 // If there's a present map type modifier, it must not be applied to the end
5669 // of a region, so generate a separate map type array in that case.
5670 if (Info.separateBeginEndCalls()) {
5671 bool EndMapTypesDiffer = false;
5672 for (uint64_t &Type : Mapping) {
5673 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5674 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
5675 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5676 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
5677 EndMapTypesDiffer = true;
5678 }
5679 }
5680 if (EndMapTypesDiffer) {
5681 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5682 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
5683 }
5684 }
5685
5686 PointerType *PtrTy = Builder.getPtrTy();
5687 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
5688 Value *BPVal = CombinedInfo.BasePointers[I];
5689 Value *BP = Builder.CreateConstInBoundsGEP2_32(
5690 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
5691 0, I);
5692 Builder.CreateAlignedStore(BPVal, BP,
5693 M.getDataLayout().getPrefTypeAlign(PtrTy));
5694
5695 if (Info.requiresDevicePointerInfo()) {
5696 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
5697 CodeGenIP = Builder.saveIP();
5698 Builder.restoreIP(AllocaIP);
5699 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
5700 Builder.restoreIP(CodeGenIP);
5701 if (DeviceAddrCB)
5702 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
5703 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
5704 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
5705 if (DeviceAddrCB)
5706 DeviceAddrCB(I, BP);
5707 }
5708 }
5709
5710 Value *PVal = CombinedInfo.Pointers[I];
5711 Value *P = Builder.CreateConstInBoundsGEP2_32(
5712 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
5713 I);
5714 // TODO: Check alignment correct.
5715 Builder.CreateAlignedStore(PVal, P,
5716 M.getDataLayout().getPrefTypeAlign(PtrTy));
5717
5718 if (RuntimeSizes.test(I)) {
5719 Value *S = Builder.CreateConstInBoundsGEP2_32(
5720 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5721 /*Idx0=*/0,
5722 /*Idx1=*/I);
5723 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
5724 Int64Ty,
5725 /*isSigned=*/true),
5726 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
5727 }
5728 // Fill up the mapper array.
5729 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5730 Value *MFunc = ConstantPointerNull::get(PtrTy);
5731 if (CustomMapperCB)
5732 if (Value *CustomMFunc = CustomMapperCB(I))
5733 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
5734 Value *MAddr = Builder.CreateInBoundsGEP(
5735 MappersArray->getAllocatedType(), MappersArray,
5736 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
5737 Builder.CreateAlignedStore(
5738 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
5739 }
5740
5741 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
5742 Info.NumberOfPtrs == 0)
5743 return;
5744 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
5745}
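// For two mapped scalars with constant sizes the result is, schematically
// (editor's sketch; 0x23 = TO | FROM | TARGET_PARAM is just one possible
// map-type encoding):
//
//   @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
//   ...
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs = alloca [2 x ptr]
//   %.offload_mappers = alloca [2 x ptr]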
5746
5747void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
5748 BasicBlock *CurBB = Builder.GetInsertBlock();
5749
5750 if (!CurBB || CurBB->getTerminator()) {
5751 // If there is no insert point or the previous block is already
5752 // terminated, don't touch it.
5753 } else {
5754 // Otherwise, create a fall-through branch.
5755 Builder.CreateBr(Target);
5756 }
5757
5758 Builder.ClearInsertionPoint();
5759}
5760
5761void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
5762 bool IsFinished) {
5763 BasicBlock *CurBB = Builder.GetInsertBlock();
5764
5765 // Fall out of the current block (if necessary).
5766 emitBranch(BB);
5767
5768 if (IsFinished && BB->use_empty()) {
5769 BB->eraseFromParent();
5770 return;
5771 }
5772
5773 // Place the block after the current block, if possible, or else at
5774 // the end of the function.
5775 if (CurBB && CurBB->getParent())
5776 CurFn->insert(std::next(CurBB->getIterator()), BB);
5777 else
5778 CurFn->insert(CurFn->end(), BB);
5779 Builder.SetInsertPoint(BB);
5780}
5781
5782void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
5783 BodyGenCallbackTy ElseGen,
5784 InsertPointTy AllocaIP) {
5785 // If the condition constant folds and can be elided, try to avoid emitting
5786 // the condition and the dead arm of the if/else.
5787 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
5788 auto CondConstant = CI->getSExtValue();
5789 if (CondConstant)
5790 ThenGen(AllocaIP, Builder.saveIP());
5791 else
5792 ElseGen(AllocaIP, Builder.saveIP());
5793 return;
5794 }
5795
5796 Function *CurFn = Builder.GetInsertBlock()->getParent();
5797
5798 // Otherwise, the condition did not fold, or we couldn't elide it. Just
5799 // emit the conditional branch.
5800 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
5801 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
5802 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
5803 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
5804 // Emit the 'then' code.
5805 emitBlock(ThenBlock, CurFn);
5806 ThenGen(AllocaIP, Builder.saveIP());
5807 emitBranch(ContBlock);
5808 // Emit the 'else' code if present.
5809 // There is no need to emit line number for unconditional branch.
5810 emitBlock(ElseBlock, CurFn);
5811 ElseGen(AllocaIP, Builder.saveIP());
5812 // There is no need to emit line number for unconditional branch.
5813 emitBranch(ContBlock);
5814 // Emit the continuation block for code after the if.
5815 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
5816}
5817
5818bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
5819 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
5820 assert(!(AO == AtomicOrdering::NotAtomic ||
5821 AO == AtomicOrdering::Unordered) &&
5822 "Unexpected Atomic Ordering.");
5823
5824 bool Flush = false;
5825 llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
5826
5827 switch (AK) {
5828 case Read:
5829 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
5830 AO == AtomicOrdering::SequentiallyConsistent) {
5831 FlushAO = AtomicOrdering::Acquire;
5832 Flush = true;
5833 }
5834 break;
5835 case Write:
5836 case Compare:
5837 case Update:
5838 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
5839 AO == AtomicOrdering::SequentiallyConsistent) {
5840 FlushAO = AtomicOrdering::Release;
5841 Flush = true;
5842 }
5843 break;
5844 case Capture:
5845 switch (AO) {
5846 case AtomicOrdering::Acquire:
5847 FlushAO = AtomicOrdering::Acquire;
5848 Flush = true;
5849 break;
5850 case AtomicOrdering::Release:
5851 FlushAO = AtomicOrdering::Release;
5852 Flush = true;
5853 break;
5854 case AtomicOrdering::AcquireRelease:
5855 case AtomicOrdering::SequentiallyConsistent:
5856 FlushAO = AtomicOrdering::AcquireRelease;
5857 Flush = true;
5858 break;
5859 default:
5860 // do nothing - leave silently.
5861 break;
5862 }
5863 }
5864
5865 if (Flush) {
5866 // The Flush RT call does not yet take a memory_ordering argument, so until
5867 // it does, this resolves which atomic ordering would apply but still issues
5868 // the plain flush call
5869 // TODO: pass `FlushAO` after memory ordering support is added
5870 (void)FlushAO;
5871 emitFlush(Loc);
5872 }
5873
5874 // for AO == AtomicOrdering::Monotonic and all other case combinations
5875 // do nothing
5876 return Flush;
5877}
5878
5879OpenMPIRBuilder::InsertPointTy
5880OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
5881 AtomicOpValue &X, AtomicOpValue &V,
5882 AtomicOrdering AO) {
5883 if (!updateToLocation(Loc))
5884 return Loc.IP;
5885
5886 assert(X.Var->getType()->isPointerTy() &&
5887 "OMP Atomic expects a pointer to target memory");
5888 Type *XElemTy = X.ElemTy;
5889 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5890 XElemTy->isPointerTy()) &&
5891 "OMP atomic read expected a scalar type");
5892
5893 Value *XRead = nullptr;
5894
5895 if (XElemTy->isIntegerTy()) {
5896 LoadInst *XLD =
5897 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
5898 XLD->setAtomic(AO);
5899 XRead = cast<Value>(XLD);
5900 } else {
5901 // We need to perform atomic op as integer
5902 IntegerType *IntCastTy =
5903 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5904 LoadInst *XLoad =
5905 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
5906 XLoad->setAtomic(AO);
5907 if (XElemTy->isFloatingPointTy()) {
5908 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
5909 } else {
5910 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
5911 }
5912 }
5913 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
5914 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
5915 return Builder.saveIP();
5916}
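// E.g. for 'v = x' on an i32 with acquire ordering this produces roughly
//
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v
//
// followed by an @__kmpc_flush call when checkAndEmitFlushAfterAtomic
// decides the ordering requires one (illustrative sketch).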
5917
5918OpenMPIRBuilder::InsertPointTy
5919OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
5920 AtomicOpValue &X, Value *Expr,
5921 AtomicOrdering AO) {
5922 if (!updateToLocation(Loc))
5923 return Loc.IP;
5924
5925 assert(X.Var->getType()->isPointerTy() &&
5926 "OMP Atomic expects a pointer to target memory");
5927 Type *XElemTy = X.ElemTy;
5928 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5929 XElemTy->isPointerTy()) &&
5930 "OMP atomic write expected a scalar type");
5931
5932 if (XElemTy->isIntegerTy()) {
5933 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
5934 XSt->setAtomic(AO);
5935 } else {
5936 // We need to bitcast and perform atomic op as integers
5937 IntegerType *IntCastTy =
5938 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5939 Value *ExprCast =
5940 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
5941 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
5942 XSt->setAtomic(AO);
5943 }
5944
5945 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
5946 return Builder.saveIP();
5947}
5948
5949OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
5950 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
5951 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
5952 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
5953 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
5954 if (!updateToLocation(Loc))
5955 return Loc.IP;
5956
5957 LLVM_DEBUG({
5958 Type *XTy = X.Var->getType();
5959 assert(XTy->isPointerTy() &&
5960 "OMP Atomic expects a pointer to target memory");
5961 Type *XElemTy = X.ElemTy;
5962 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5963 XElemTy->isPointerTy()) &&
5964 "OMP atomic update expected a scalar type");
5965 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
5966 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
5967 "OpenMP atomic does not support LT or GT operations");
5968 });
5969
5970 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
5971 X.IsVolatile, IsXBinopExpr);
5972 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
5973 return Builder.saveIP();
5974}
5975
5976// FIXME: Duplicating AtomicExpand
5977Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
5978 AtomicRMWInst::BinOp RMWOp) {
5979 switch (RMWOp) {
5980 case AtomicRMWInst::Add:
5981 return Builder.CreateAdd(Src1, Src2);
5982 case AtomicRMWInst::Sub:
5983 return Builder.CreateSub(Src1, Src2);
5984 case AtomicRMWInst::And:
5985 return Builder.CreateAnd(Src1, Src2);
5987 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
5988 case AtomicRMWInst::Or:
5989 return Builder.CreateOr(Src1, Src2);
5990 case AtomicRMWInst::Xor:
5991 return Builder.CreateXor(Src1, Src2);
5992 case AtomicRMWInst::Xchg:
5993 case AtomicRMWInst::FAdd:
5994 case AtomicRMWInst::FSub:
5995 case AtomicRMWInst::BAD_BINOP:
5996 case AtomicRMWInst::Max:
5997 case AtomicRMWInst::Min:
5998 case AtomicRMWInst::UMax:
5999 case AtomicRMWInst::UMin:
6000 case AtomicRMWInst::FMax:
6001 case AtomicRMWInst::FMin:
6002 case AtomicRMWInst::UIncWrap:
6003 case AtomicRMWInst::UDecWrap:
6004 llvm_unreachable("Unsupported atomic update operation");
6005 }
6006 llvm_unreachable("Unsupported atomic update operation");
6007}
6008
6009std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
6010 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
6011 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
6012 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
6013 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
6014 // or a complex datatype.
6015 bool emitRMWOp = false;
6016 switch (RMWOp) {
6017 case AtomicRMWInst::Add:
6018 case AtomicRMWInst::And:
6019 case AtomicRMWInst::Nand:
6020 case AtomicRMWInst::Or:
6021 case AtomicRMWInst::Xor:
6022 case AtomicRMWInst::Xchg:
6023 emitRMWOp = XElemTy;
6024 break;
6025 case AtomicRMWInst::Sub:
6026 emitRMWOp = (IsXBinopExpr && XElemTy);
6027 break;
6028 default:
6029 emitRMWOp = false;
6030 }
6031 emitRMWOp &= XElemTy->isIntegerTy();
6032
6033 std::pair<Value *, Value *> Res;
6034 if (emitRMWOp) {
6035 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
6036 // Not needed except in case of postfix captures. Generate anyway for
6037 // consistency with the else part. Will be removed with any DCE pass.
6038 // AtomicRMWInst::Xchg does not have a corresponding instruction.
6039 if (RMWOp == AtomicRMWInst::Xchg)
6040 Res.second = Res.first;
6041 else
6042 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
6043 } else {
6044 IntegerType *IntCastTy =
6045 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
6046 LoadInst *OldVal =
6047 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
6048 OldVal->setAtomic(AO);
6049 // CurBB
6050 // | /---\
6051 // ContBB |
6052 // | \---/
6053 // ExitBB
6054 BasicBlock *CurBB = Builder.GetInsertBlock();
6055 Instruction *CurBBTI = CurBB->getTerminator();
6056 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
6057 BasicBlock *ExitBB =
6058 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
6059 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
6060 X->getName() + ".atomic.cont");
6061 ContBB->getTerminator()->eraseFromParent();
6062 Builder.restoreIP(AllocaIP);
6063 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
6064 NewAtomicAddr->setName(X->getName() + "x.new.val");
6065 Builder.SetInsertPoint(ContBB);
6066 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
6067 PHI->addIncoming(OldVal, CurBB);
6068 bool IsIntTy = XElemTy->isIntegerTy();
6069 Value *OldExprVal = PHI;
6070 if (!IsIntTy) {
6071 if (XElemTy->isFloatingPointTy()) {
6072 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
6073 X->getName() + ".atomic.fltCast");
6074 } else {
6075 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
6076 X->getName() + ".atomic.ptrCast");
6077 }
6078 }
6079
6080 Value *Upd = UpdateOp(OldExprVal, Builder);
6081 Builder.CreateStore(Upd, NewAtomicAddr);
6082 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
6083 AtomicOrdering Failure =
6084 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
6085 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
6086 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
6087 Result->setVolatile(VolatileX);
6088 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
6089 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6090 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
6091 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
6092
6093 Res.first = OldExprVal;
6094 Res.second = Upd;
6095
6096 // set Insertion point in exit block
6097 if (UnreachableInst *ExitTI =
6098 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
6099 CurBBTI->eraseFromParent();
6100 Builder.SetInsertPoint(ExitBB);
6101 } else {
6102 Builder.SetInsertPoint(ExitTI);
6103 }
6104 }
6105
6106 return Res;
6107}
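// When no single atomicrmw covers the update, the code above lowers it to a
// classic compare-exchange loop, schematically (for an i32 location %x):
//
//   %old = load atomic i32, ptr %x <AO>
//   br label %cont
// cont:
//   %phi  = phi i32 [ %old, %entry ], [ %prev, %cont ]
//   %upd  = <UpdateOp applied to %phi>
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd <AO> <failure ordering>
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok   = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %exit, label %cont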
6108
6109OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
6110 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
6111 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
6112 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
6113 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
6114 if (!updateToLocation(Loc))
6115 return Loc.IP;
6116
6117 LLVM_DEBUG({
6118 Type *XTy = X.Var->getType();
6119 assert(XTy->isPointerTy() &&
6120 "OMP Atomic expects a pointer to target memory");
6121 Type *XElemTy = X.ElemTy;
6122 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
6123 XElemTy->isPointerTy()) &&
6124 "OMP atomic capture expected a scalar type");
6125 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
6126 "OpenMP atomic does not support LT or GT operations");
6127 });
6128
6129 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
6130 // 'x' is simply atomically rewritten with 'expr'.
6131 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
6132 std::pair<Value *, Value *> Result =
6133 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
6134 X.IsVolatile, IsXBinopExpr);
6135
6136 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
6137 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
6138
6139 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
6140 return Builder.saveIP();
6141}
6142
6143OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6144 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6145 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6146 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6147 bool IsFailOnly) {
6148
6149 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
6150 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
6151 IsPostfixUpdate, IsFailOnly, Failure);
6152}
6153
6154OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6155 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6156 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6157 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6158 bool IsFailOnly, AtomicOrdering Failure) {
6159
6160 if (!updateToLocation(Loc))
6161 return Loc.IP;
6162
6163 assert(X.Var->getType()->isPointerTy() &&
6164 "OMP atomic expects a pointer to target memory");
6165 // compare capture
6166 if (V.Var) {
6167 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
6168 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
6169 }
6170
6171 bool IsInteger = E->getType()->isIntegerTy();
6172
6173 if (Op == OMPAtomicCompareOp::EQ) {
6174 AtomicCmpXchgInst *Result = nullptr;
6175 if (!IsInteger) {
6176 IntegerType *IntCastTy =
6177 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
6178 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
6179 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
6180 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
6181 AO, Failure);
6182 } else {
6183 Result =
6184 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
6185 }
6186
6187 if (V.Var) {
6188 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
6189 if (!IsInteger)
6190 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
6191 assert(OldValue->getType() == V.ElemTy &&
6192 "OldValue and V must be of same type");
6193 if (IsPostfixUpdate) {
6194 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
6195 } else {
6196 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6197 if (IsFailOnly) {
6198 // CurBB----
6199 // | |
6200 // v |
6201 // ContBB |
6202 // | |
6203 // v |
6204 // ExitBB <-
6205 //
6206 // where ContBB only contains the store of old value to 'v'.
6208 Instruction *CurBBTI = CurBB->getTerminator();
6209 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
6210 BasicBlock *ExitBB = CurBB->splitBasicBlock(
6211 CurBBTI, X.Var->getName() + ".atomic.exit");
6212 BasicBlock *ContBB = CurBB->splitBasicBlock(
6213 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
6214 ContBB->getTerminator()->eraseFromParent();
6215 CurBB->getTerminator()->eraseFromParent();
6216
6217 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
6218
6219 Builder.SetInsertPoint(ContBB);
6220 Builder.CreateStore(OldValue, V.Var);
6221 Builder.CreateBr(ExitBB);
6222
6223 if (UnreachableInst *ExitTI =
6224 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
6225 CurBBTI->eraseFromParent();
6226 Builder.SetInsertPoint(ExitBB);
6227 } else {
6228 Builder.SetInsertPoint(ExitTI);
6229 }
6230 } else {
6231 Value *CapturedValue =
6232 Builder.CreateSelect(SuccessOrFail, E, OldValue);
6233 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6234 }
6235 }
6236 }
6237 // The comparison result has to be stored.
6238 if (R.Var) {
6239 assert(R.Var->getType()->isPointerTy() &&
6240 "r.var must be of pointer type");
6241 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
6242
6243 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6244 Value *ResultCast = R.IsSigned
6245 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
6246 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
6247 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
6248 }
6249 } else {
6250 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
6251 "Op should be either max or min at this point");
6252 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
6253
6254 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
6255 // Let's take max as example.
6256 // OpenMP form:
6257 // x = x > expr ? expr : x;
6258 // LLVM form:
6259 // *ptr = *ptr > val ? *ptr : val;
6260 // We need to transform to LLVM form.
6261 // x = x <= expr ? x : expr;
6262 AtomicRMWInst::BinOp NewOp;
6263 if (IsXBinopExpr) {
6264 if (IsInteger) {
6265 if (X.IsSigned)
6266 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
6267 : AtomicRMWInst::Max;
6268 else
6269 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
6270 : AtomicRMWInst::UMax;
6271 } else {
6272 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
6273 : AtomicRMWInst::FMax;
6274 }
6275 } else {
6276 if (IsInteger) {
6277 if (X.IsSigned)
6278 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
6279 : AtomicRMWInst::Min;
6280 else
6281 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
6282 : AtomicRMWInst::UMin;
6283 } else {
6284 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
6285 : AtomicRMWInst::FMin;
6286 }
6287 }
6288
6289 AtomicRMWInst *OldValue =
6290 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
6291 if (V.Var) {
6292 Value *CapturedValue = nullptr;
6293 if (IsPostfixUpdate) {
6294 CapturedValue = OldValue;
6295 } else {
6296 CmpInst::Predicate Pred;
6297 switch (NewOp) {
6298 case AtomicRMWInst::Max:
6299 Pred = CmpInst::ICMP_SGT;
6300 break;
6301 case AtomicRMWInst::UMax:
6302 Pred = CmpInst::ICMP_UGT;
6303 break;
6304 case AtomicRMWInst::FMax:
6305 Pred = CmpInst::FCMP_OGT;
6306 break;
6307 case AtomicRMWInst::Min:
6308 Pred = CmpInst::ICMP_SLT;
6309 break;
6311 Pred = CmpInst::ICMP_ULT;
6312 break;
6314 Pred = CmpInst::FCMP_OLT;
6315 break;
6316 default:
6317 llvm_unreachable("unexpected comparison op");
6318 }
6319 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
6320 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
6321 }
6322 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6323 }
6324 }
6325
6326 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
6327
6328 return Builder.saveIP();
6329}
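// Editor's note: a minimal usage sketch (not part of this file) of driving the
// entry point above from a frontend, for the OpenMP atomic-compare-capture
// form `v = x; x = x > e ? e : x;` discussed in the comments. All names
// (OMPBuilder, Builder, Loc, XAddr, VAddr, Expr) are assumed to exist in the
// caller.
// ```
// OpenMPIRBuilder::AtomicOpValue XVal{XAddr, Builder.getInt32Ty(),
//                                     /*IsSigned=*/true, /*IsVolatile=*/false};
// OpenMPIRBuilder::AtomicOpValue VVal{VAddr, Builder.getInt32Ty(),
//                                     /*IsSigned=*/true, /*IsVolatile=*/false};
// OpenMPIRBuilder::AtomicOpValue RVal; // no comparison result requested
// Builder.restoreIP(OMPBuilder.createAtomicCompare(
//     Loc, XVal, VVal, RVal, /*E=*/Expr, /*D=*/nullptr,
//     AtomicOrdering::Monotonic, OMPAtomicCompareOp::MAX,
//     /*IsXBinopExpr=*/true, /*IsPostfixUpdate=*/true, /*IsFailOnly=*/false));
// ```
// Per the min/max branch above, this lowers to an atomicrmw plus a select
// rather than a compare-exchange.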
6330
6331 OpenMPIRBuilder::InsertPointTy
6332 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
6333 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
6334 Value *NumTeamsUpper, Value *ThreadLimit,
6335 Value *IfExpr) {
6336 if (!updateToLocation(Loc))
6337 return InsertPointTy();
6338
6339 uint32_t SrcLocStrSize;
6340 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6341 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6342 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
6343
6344 // The outer allocation basic block is the entry block of the current function.
6345 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
6346 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
6347 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
6348 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
6349 }
6350
6351 // The current basic block is split into four basic blocks. After outlining,
6352 // they will be mapped as follows:
6353 // ```
6354 // def current_fn() {
6355 // current_basic_block:
6356 // br label %teams.exit
6357 // teams.exit:
6358 // ; instructions after teams
6359 // }
6360 //
6361 // def outlined_fn() {
6362 // teams.alloca:
6363 // br label %teams.body
6364 // teams.body:
6365 // ; instructions within teams body
6366 // }
6367 // ```
6368 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
6369 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
6370 BasicBlock *AllocaBB =
6371 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
6372
6373 bool SubClausesPresent =
6374 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
6375 // Push num_teams
6376 if (!Config.isTargetDevice() && SubClausesPresent) {
6377 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
6378 "if lowerbound is non-null, then upperbound must also be non-null "
6379 "for bounds on num_teams");
6380
6381 if (NumTeamsUpper == nullptr)
6382 NumTeamsUpper = Builder.getInt32(0);
6383
6384 if (NumTeamsLower == nullptr)
6385 NumTeamsLower = NumTeamsUpper;
6386
6387 if (IfExpr) {
6388 assert(IfExpr->getType()->isIntegerTy() &&
6389 "argument to if clause must be an integer value");
6390
6391 // upper = ifexpr ? upper : 1
6392 if (IfExpr->getType() != Int1)
6393 IfExpr = Builder.CreateICmpNE(IfExpr,
6394 ConstantInt::get(IfExpr->getType(), 0));
6395 NumTeamsUpper = Builder.CreateSelect(
6396 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
6397
6398 // lower = ifexpr ? lower : 1
6399 NumTeamsLower = Builder.CreateSelect(
6400 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
6401 }
6402
6403 if (ThreadLimit == nullptr)
6404 ThreadLimit = Builder.getInt32(0);
6405
6406 Value *ThreadNum = getOrCreateThreadID(Ident);
6407 Builder.CreateCall(
6408 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
6409 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
6410 }
6411 // Generate the body of teams.
6412 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
6413 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
6414 BodyGenCB(AllocaIP, CodeGenIP);
6415
6416 OutlineInfo OI;
6417 OI.EntryBB = AllocaBB;
6418 OI.ExitBB = ExitBB;
6419 OI.OuterAllocaBB = &OuterAllocaBB;
6420
6421 // Insert fake values for global tid and bound tid.
6422 std::stack<Instruction *> ToBeDeleted;
6423 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
6424 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6425 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
6426 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6427 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
6428
6429 auto HostPostOutlineCB = [this, Ident,
6430 ToBeDeleted](Function &OutlinedFn) mutable {
6431 // The stale call instruction will be replaced with a new call instruction
6432 // that invokes the runtime function with the outlined function as argument.
6433
6434 assert(OutlinedFn.getNumUses() == 1 &&
6435 "there must be a single user for the outlined function");
6436 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6437 ToBeDeleted.push(StaleCI);
6438
6439 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
6440 "Outlined function must have two or three arguments only");
6441
6442 bool HasShared = OutlinedFn.arg_size() == 3;
6443
6444 OutlinedFn.getArg(0)->setName("global.tid.ptr");
6445 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
6446 if (HasShared)
6447 OutlinedFn.getArg(2)->setName("data");
6448
6449 // Call to the runtime function for teams in the current function.
6450 assert(StaleCI && "Error while outlining - no CallInst user found for the "
6451 "outlined function.");
6452 Builder.SetInsertPoint(StaleCI);
6453 SmallVector<Value *> Args = {
6454 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
6455 if (HasShared)
6456 Args.push_back(StaleCI->getArgOperand(2));
6457 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
6458 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
6459 Args);
6460
6461 while (!ToBeDeleted.empty()) {
6462 ToBeDeleted.top()->eraseFromParent();
6463 ToBeDeleted.pop();
6464 }
6465 };
6466
6467 if (!Config.isTargetDevice())
6468 OI.PostOutlineCB = HostPostOutlineCB;
6469
6470 addOutlineInfo(std::move(OI));
6471
6472 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
6473
6474 return Builder.saveIP();
6475}
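// Editor's sketch (names assumed) of driving createTeams from a frontend,
// roughly corresponding to `#pragma omp teams num_teams(8)`:
// ```
// auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) {
//   Builder.restoreIP(CodeGenIP);
//   // ... emit the teams region body here ...
// };
// Builder.restoreIP(OMPBuilder.createTeams(
//     Loc, BodyGenCB, /*NumTeamsLower=*/nullptr,
//     /*NumTeamsUpper=*/Builder.getInt32(8), /*ThreadLimit=*/nullptr,
//     /*IfExpr=*/nullptr));
// ```
// On the host this ends in a __kmpc_push_num_teams_51 call followed by a
// __kmpc_fork_teams call that invokes the outlined body, as implemented above.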
6476
6477 GlobalVariable *
6478 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
6479 std::string VarName) {
6480 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
6481 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
6482 Names.size()),
6483 Names);
6484 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
6485 M, MapNamesArrayInit->getType(),
6486 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
6487 VarName);
6488 return MapNamesArrayGlobal;
6489}
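// Editor's illustration (made-up names): for two map-name strings this
// produces a private constant array global along the lines of
// ```
// @.offload_mapnames = private constant [2 x ptr] [ptr @.str.0, ptr @.str.1]
// ```
// where the global's name is the VarName argument and each element points to
// a name string constant supplied by the caller in Names.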
6490
6491// Create all simple and struct types exposed by the runtime and remember
6492// the llvm::PointerTypes of them for easy access later.
6493void OpenMPIRBuilder::initializeTypes(Module &M) {
6494 LLVMContext &Ctx = M.getContext();
6495 StructType *T;
6496#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
6497#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
6498 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
6499 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
6500#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
6501 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
6502 VarName##Ptr = PointerType::getUnqual(VarName);
6503#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
6504 T = StructType::getTypeByName(Ctx, StructName); \
6505 if (!T) \
6506 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
6507 VarName = T; \
6508 VarName##Ptr = PointerType::getUnqual(T);
6509#include "llvm/Frontend/OpenMP/OMPKinds.def"
6510}
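// Editor's note: as an example of the macro expansion above, the OMPKinds.def
// entry `OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)` becomes
// ```
// KmpCriticalNameTy = ArrayType::get(Int32, 8);
// KmpCriticalNamePtrTy = PointerType::getUnqual(KmpCriticalNameTy);
// ```
// so both the array type and its pointer type become members of the builder.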
6511
6512 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
6513 SmallPtrSetImpl<BasicBlock *> &BlockSet,
6514 SmallVectorImpl<BasicBlock *> &BlockVector) {
6515 SmallVector<BasicBlock *, 32> Worklist;
6516 BlockSet.insert(EntryBB);
6517 BlockSet.insert(ExitBB);
6518
6519 Worklist.push_back(EntryBB);
6520 while (!Worklist.empty()) {
6521 BasicBlock *BB = Worklist.pop_back_val();
6522 BlockVector.push_back(BB);
6523 for (BasicBlock *SuccBB : successors(BB))
6524 if (BlockSet.insert(SuccBB).second)
6525 Worklist.push_back(SuccBB);
6526 }
6527}
6528
6529 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
6530 uint64_t Size, int32_t Flags,
6531 GlobalValue::LinkageTypes,
6532 StringRef Name) {
6533 if (!Config.isGPU()) {
6534 llvm::offloading::emitOffloadingEntry(
6535 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
6536 "omp_offloading_entries");
6537 return;
6538 }
6539 // TODO: Add support for global variables on the device after declare target
6540 // support.
6541 Function *Fn = dyn_cast<Function>(Addr);
6542 if (!Fn)
6543 return;
6544
6545 Module &M = *(Fn->getParent());
6546 LLVMContext &Ctx = M.getContext();
6547
6548 // Get "nvvm.annotations" metadata node.
6549 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6550
6551 Metadata *MDVals[] = {
6552 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
6553 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
6554 // Append metadata to nvvm.annotations.
6555 MD->addOperand(MDNode::get(Ctx, MDVals));
6556
6557 // Add a function attribute for the kernel.
6558 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
6559 if (T.isAMDGCN())
6560 Fn->addFnAttr("uniform-work-group-size", "true");
6561 Fn->addFnAttr(Attribute::MustProgress);
6562}
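// Editor's illustration of the annotations emitted above for a device kernel
// `foo` (IR shown approximately):
// ```
// !nvvm.annotations = !{!0}
// !0 = !{ptr @foo, !"kernel", i32 1}
// ```
// together with the "kernel" function attribute, "uniform-work-group-size"
// set to "true" on AMDGCN, and the mustprogress attribute.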
6563
6564 // We only generate metadata for functions that contain target regions.
6565 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
6566 EmitMetadataErrorReportFunctionTy &ErrorFn) {
6567
6568 // If there are no entries, we don't need to do anything.
6569 if (OffloadInfoManager.empty())
6570 return;
6571
6572 LLVMContext &C = M.getContext();
6573 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
6574 TargetRegionEntryInfo>,
6575 16>
6576 OrderedEntries(OffloadInfoManager.size());
6577
6578 // Auxiliary methods to create metadata values and strings.
6579 auto &&GetMDInt = [this](unsigned V) {
6580 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
6581 };
6582
6583 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
6584
6585 // Create the offloading info metadata node.
6586 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
6587 auto &&TargetRegionMetadataEmitter =
6588 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
6589 const TargetRegionEntryInfo &EntryInfo,
6590 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
6591 // Generate metadata for target regions. Each entry of this metadata
6592 // contains:
6593 // - Entry 0 -> Kind of this type of metadata (0).
6594 // - Entry 1 -> Device ID of the file where the entry was identified.
6595 // - Entry 2 -> File ID of the file where the entry was identified.
6596 // - Entry 3 -> Mangled name of the function where the entry was
6597 // identified.
6598 // - Entry 4 -> Line in the file where the entry was identified.
6599 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
6600 // - Entry 6 -> Order the entry was created.
6601 // The first element of the metadata node is the kind.
6602 Metadata *Ops[] = {
6603 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
6604 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
6605 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
6606 GetMDInt(E.getOrder())};
6607
6608 // Save this entry in the right position of the ordered entries array.
6609 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
6610
6611 // Add metadata to the named metadata node.
6612 MD->addOperand(MDNode::get(C, Ops));
6613 };
6614
6615 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
6616
6617 // Create a function that emits metadata for each device global variable entry.
6618 auto &&DeviceGlobalVarMetadataEmitter =
6619 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
6620 StringRef MangledName,
6621 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
6622 // Generate metadata for global variables. Each entry of this metadata
6623 // contains:
6624 // - Entry 0 -> Kind of this type of metadata (1).
6625 // - Entry 1 -> Mangled name of the variable.
6626 // - Entry 2 -> Declare target kind.
6627 // - Entry 3 -> Order the entry was created.
6628 // The first element of the metadata node is the kind.
6629 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
6630 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
6631
6632 // Save this entry in the right position of the ordered entries array.
6633 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
6634 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
6635
6636 // Add metadata to the named metadata node.
6637 MD->addOperand(MDNode::get(C, Ops));
6638 };
6639
6640 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
6641 DeviceGlobalVarMetadataEmitter);
6642
6643 for (const auto &E : OrderedEntries) {
6644 assert(E.first && "All ordered entries must exist!");
6645 if (const auto *CE =
6646 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
6647 E.first)) {
6648 if (!CE->getID() || !CE->getAddress()) {
6649 // Do not blame the entry if the parent function is not emitted.
6650 TargetRegionEntryInfo EntryInfo = E.second;
6651 StringRef FnName = EntryInfo.ParentName;
6652 if (!M.getNamedValue(FnName))
6653 continue;
6654 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
6655 continue;
6656 }
6657 createOffloadEntry(CE->getID(), CE->getAddress(),
6658 /*Size=*/0, CE->getFlags(),
6659 GlobalValue::WeakAnyLinkage);
6660 } else if (const auto *CE = dyn_cast<
6661 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
6662 E.first)) {
6663 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
6664 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6665 CE->getFlags());
6666 switch (Flags) {
6667 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
6668 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
6669 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
6670 continue;
6671 if (!CE->getAddress()) {
6672 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
6673 continue;
6674 }
6675 // The variable has no definition - no need to add the entry.
6676 if (CE->getVarSize() == 0)
6677 continue;
6678 break;
6679 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
6680 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
6681 (!Config.isTargetDevice() && CE->getAddress())) &&
6682 "Declare target link address is set.");
6683 if (Config.isTargetDevice())
6684 continue;
6685 if (!CE->getAddress()) {
6686 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
6687 continue;
6688 }
6689 break;
6690 default:
6691 break;
6692 }
6693
6694 // Hidden or internal symbols on the device are not externally visible.
6695 // We should not attempt to register them by creating an offloading
6696 // entry. Indirect variables are handled separately on the device.
6697 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
6698 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
6699 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6700 continue;
6701
6702 // Indirect globals need to use a special name that doesn't match the name
6703 // of the associated host global.
6704 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6705 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6706 Flags, CE->getLinkage(), CE->getVarName());
6707 else
6708 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6709 Flags, CE->getLinkage());
6710
6711 } else {
6712 llvm_unreachable("Unsupported entry kind.");
6713 }
6714 }
6715
6716 // Emit requires directive globals to a special entry so the runtime can
6717 // register them when the device image is loaded.
6718 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
6719 // entries should be redesigned to better suit this use-case.
6720 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
6721 offloading::emitOffloadingEntry(
6722 M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
6723 /*Name=*/"",
6724 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
6725 Config.getRequiresFlags(), "omp_offloading_entries");
6726}
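// Editor's illustration (values invented) of the !omp_offload.info node
// produced by the emitters above; the fields follow the layouts documented
// inline:
// ```
// !omp_offload.info = !{!0, !1}
// !0 = !{i32 0, i32 42, i32 17, !"_Z3foov", i32 4, i32 0, i32 0} ; target region
// !1 = !{i32 1, !"gbl", i32 0, i32 1}                            ; declare target global
// ```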
6727
6728 void TargetRegionEntryInfo::getTargetRegionEntryFnName(
6729 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
6730 unsigned FileID, unsigned Line, unsigned Count) {
6731 raw_svector_ostream OS(Name);
6732 OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
6733 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
6734 if (Count)
6735 OS << "_" << Count;
6736}
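// Example output (editor's note, derived from the format string above):
// DeviceID 0x2b, FileID 0x1, ParentName "foo", Line 12 and Count 0 yield
// "__omp_offloading_2b_1_foo_l12"; with Count 2 the suffix "_2" is appended.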
6737
6738 void OpenMPIRBuilder::getTargetRegionEntryFnName(
6739 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
6740 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
6741 TargetRegionEntryInfo::getTargetRegionEntryFnName(
6742 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
6743 EntryInfo.Line, NewCount);
6744}
6745
6746 TargetRegionEntryInfo
6747 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
6748 StringRef ParentName) {
6749 sys::fs::UniqueID ID;
6750 auto FileIDInfo = CallBack();
6751 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
6752 report_fatal_error(("Unable to get unique ID for file, during "
6753 "getTargetEntryUniqueInfo, error message: " +
6754 EC.message())
6755 .c_str());
6756 }
6757
6758 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
6759 std::get<1>(FileIDInfo));
6760}
6761
6762 unsigned OpenMPIRBuilder::getFlagMemberOffset() {
6763 unsigned Offset = 0;
6764 for (uint64_t Remain =
6765 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6766 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
6767 !(Remain & 1); Remain = Remain >> 1)
6768 Offset++;
6769 return Offset;
6770}
6771
6772 omp::OpenMPOffloadMappingFlags
6773 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
6774 // Shift by getFlagMemberOffset() bits.
6775 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
6776 << getFlagMemberOffset());
6777}
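// Worked example (editor's note): if OMP_MAP_MEMBER_OF occupies the top 16
// bits of a 64-bit flag word, getFlagMemberOffset() returns 48 by counting
// its trailing zero bits, and getMemberOfFlag(/*Position=*/0) then encodes
// the first member as 1ULL << 48.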
6778
6779 void OpenMPIRBuilder::setCorrectMemberOfFlag(
6780 omp::OpenMPOffloadMappingFlags &Flags,
6781 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
6782 // If the entry is PTR_AND_OBJ but has not been marked with the special
6783 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
6784 // marked as MEMBER_OF.
6785 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6786 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
6787 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6788 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
6789 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
6790 return;
6791
6792 // Reset the placeholder value to prepare the flag for the assignment of the
6793 // proper MEMBER_OF value.
6794 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
6795 Flags |= MemberOfFlag;
6796}
6797
6798 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
6799 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6800 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6801 bool IsDeclaration, bool IsExternallyVisible,
6802 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6803 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6804 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
6805 std::function<Constant *()> GlobalInitializer,
6806 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
6807 // TODO: convert this to utilise the IRBuilder Config rather than
6808 // a passed down argument.
6809 if (OpenMPSIMD)
6810 return nullptr;
6811
6812 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
6813 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6814 CaptureClause ==
6815 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6816 Config.hasRequiresUnifiedSharedMemory())) {
6817 SmallString<64> PtrName;
6818 {
6819 raw_svector_ostream OS(PtrName);
6820 OS << MangledName;
6821 if (!IsExternallyVisible)
6822 OS << format("_%x", EntryInfo.FileID);
6823 OS << "_decl_tgt_ref_ptr";
6824 }
6825
6826 Value *Ptr = M.getNamedValue(PtrName);
6827
6828 if (!Ptr) {
6829 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
6830 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
6831
6832 auto *GV = cast<GlobalVariable>(Ptr);
6833 GV->setLinkage(GlobalValue::WeakAnyLinkage);
6834
6835 if (!Config.isTargetDevice()) {
6836 if (GlobalInitializer)
6837 GV->setInitializer(GlobalInitializer());
6838 else
6839 GV->setInitializer(GlobalValue);
6840 }
6841
6842 registerTargetGlobalVariable(
6843 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6844 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6845 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
6846 }
6847
6848 return cast<Constant>(Ptr);
6849 }
6850
6851 return nullptr;
6852}
6853
6854 void OpenMPIRBuilder::registerTargetGlobalVariable(
6855 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6856 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6857 bool IsDeclaration, bool IsExternallyVisible,
6858 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6859 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6860 std::vector<Triple> TargetTriple,
6861 std::function<Constant *()> GlobalInitializer,
6862 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
6863 Constant *Addr) {
6864 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
6865 (TargetTriple.empty() && !Config.isTargetDevice()))
6866 return;
6867
6868 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
6869 StringRef VarName;
6870 int64_t VarSize;
6871 GlobalValue::LinkageTypes Linkage;
6872
6873 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6874 CaptureClause ==
6875 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6876 !Config.hasRequiresUnifiedSharedMemory()) {
6877 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6878 VarName = MangledName;
6879 GlobalValue *LlvmVal = M.getNamedValue(VarName);
6880
6881 if (!IsDeclaration)
6882 VarSize = divideCeil(
6883 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
6884 else
6885 VarSize = 0;
6886 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
6887
6888 // This is a workaround carried over from Clang which prevents undesired
6889 // optimisation of internal variables.
6890 if (Config.isTargetDevice() &&
6891 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
6892 // Do not create a "ref-variable" if the original is not also available
6893 // on the host.
6894 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
6895 return;
6896
6897 std::string RefName = createPlatformSpecificName({VarName, "ref"});
6898
6899 if (!M.getNamedValue(RefName)) {
6900 Constant *AddrRef =
6901 getOrCreateInternalVariable(Addr->getType(), RefName);
6902 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
6903 GvAddrRef->setConstant(true);
6904 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
6905 GvAddrRef->setInitializer(Addr);
6906 GeneratedRefs.push_back(GvAddrRef);
6907 }
6908 }
6909 } else {
6910 if (Config.hasRequiresUnifiedSharedMemory())
6911 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6912 else
6913 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
6914
6915 if (Config.isTargetDevice()) {
6916 VarName = (Addr) ? Addr->getName() : "";
6917 Addr = nullptr;
6918 } else {
6919 Addr = getAddrOfDeclareTargetVar(
6920 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6921 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6922 LlvmPtrTy, GlobalInitializer, VariableLinkage);
6923 VarName = (Addr) ? Addr->getName() : "";
6924 }
6925 VarSize = M.getDataLayout().getPointerSize();
6926 Linkage = GlobalValue::WeakAnyLinkage;
6927 }
6928
6929 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
6930 Flags, Linkage);
6931}
6932
6933/// Loads all the offload entries information from the host IR
6934/// metadata.
6935 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
6936 // If we are in target mode, load the metadata from the host IR. This code has
6937 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
6938
6939 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
6940 if (!MD)
6941 return;
6942
6943 for (MDNode *MN : MD->operands()) {
6944 auto &&GetMDInt = [MN](unsigned Idx) {
6945 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
6946 return cast<ConstantInt>(V->getValue())->getZExtValue();
6947 };
6948
6949 auto &&GetMDString = [MN](unsigned Idx) {
6950 auto *V = cast<MDString>(MN->getOperand(Idx));
6951 return V->getString();
6952 };
6953
6954 switch (GetMDInt(0)) {
6955 default:
6956 llvm_unreachable("Unexpected metadata!");
6957 break;
6958 case OffloadEntriesInfoManager::OffloadEntryInfo::
6959 OffloadingEntryInfoTargetRegion: {
6960 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
6961 /*DeviceID=*/GetMDInt(1),
6962 /*FileID=*/GetMDInt(2),
6963 /*Line=*/GetMDInt(4),
6964 /*Count=*/GetMDInt(5));
6965 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
6966 /*Order=*/GetMDInt(6));
6967 break;
6968 }
6969 case OffloadEntriesInfoManager::OffloadEntryInfo::
6970 OffloadingEntryInfoDeviceGlobalVar:
6971 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
6972 /*MangledName=*/GetMDString(1),
6973 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6974 /*Flags=*/GetMDInt(2)),
6975 /*Order=*/GetMDInt(3));
6976 break;
6977 }
6978 }
6979}
6980
6981 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
6982 if (HostFilePath.empty())
6983 return;
6984
6985 auto Buf = MemoryBuffer::getFile(HostFilePath);
6986 if (std::error_code Err = Buf.getError()) {
6987 report_fatal_error(("error opening host file from host file path inside of "
6988 "OpenMPIRBuilder: " +
6989 Err.message())
6990 .c_str());
6991 }
6992
6993 LLVMContext Ctx;
6994 auto M = expectedToErrorOrAndEmitErrors(
6995 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
6996 if (std::error_code Err = M.getError()) {
6997 report_fatal_error(
6998 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
6999 .c_str());
7000 }
7001
7002 loadOffloadInfoMetadata(*M.get());
7003}
7004
7005//===----------------------------------------------------------------------===//
7006// OffloadEntriesInfoManager
7007//===----------------------------------------------------------------------===//
7008
7009 bool OffloadEntriesInfoManager::empty() const {
7010 return OffloadEntriesTargetRegion.empty() &&
7011 OffloadEntriesDeviceGlobalVar.empty();
7012}
7013
7014unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
7015 const TargetRegionEntryInfo &EntryInfo) const {
7016 auto It = OffloadEntriesTargetRegionCount.find(
7017 getTargetRegionEntryCountKey(EntryInfo));
7018 if (It == OffloadEntriesTargetRegionCount.end())
7019 return 0;
7020 return It->second;
7021}
7022
7023void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
7024 const TargetRegionEntryInfo &EntryInfo) {
7025 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
7026 EntryInfo.Count + 1;
7027}
7028
7029/// Initialize target region entry.
7030 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
7031 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
7032 OffloadEntriesTargetRegion[EntryInfo] =
7033 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
7034 OMPTargetRegionEntryTargetRegion);
7035 ++OffloadingEntriesNum;
7036}
7037
7038 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
7039 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
7040 OMPTargetRegionEntryKind Flags) {
7041 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
7042
7043 // Update the EntryInfo with the next available count for this location.
7044 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
7045
7046 // If we are emitting code for a target, the entry is already initialized,
7047 // only has to be registered.
7048 if (OMPBuilder->Config.isTargetDevice()) {
7049 // This could happen if the device compilation is invoked standalone.
7050 if (!hasTargetRegionEntryInfo(EntryInfo)) {
7051 return;
7052 }
7053 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
7054 Entry.setAddress(Addr);
7055 Entry.setID(ID);
7056 Entry.setFlags(Flags);
7057 } else {
7058 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
7059 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
7060 return;
7061 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
7062 "Target region entry already registered!");
7063 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
7064 OffloadEntriesTargetRegion[EntryInfo] = Entry;
7065 ++OffloadingEntriesNum;
7066 }
7067 incrementTargetRegionEntryInfoCount(EntryInfo);
7068}
7069
7070 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
7071 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
7072
7073 // Update the EntryInfo with the next available count for this location.
7074 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
7075
7076 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
7077 if (It == OffloadEntriesTargetRegion.end()) {
7078 return false;
7079 }
7080 // Fail if this entry is already registered.
7081 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
7082 return false;
7083 return true;
7084}
7085
7086 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
7087 const OffloadTargetRegionEntryInfoActTy &Action) {
7088 // Scan all target region entries and perform the provided action.
7089 for (const auto &It : OffloadEntriesTargetRegion) {
7090 Action(It.first, It.second);
7091 }
7092}
7093
7094 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
7095 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
7096 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
7097 ++OffloadingEntriesNum;
7098}
7099
7100 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
7101 StringRef VarName, Constant *Addr, int64_t VarSize,
7102 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
7103 if (OMPBuilder->Config.isTargetDevice()) {
7104 // This could happen if the device compilation is invoked standalone.
7105 if (!hasDeviceGlobalVarEntryInfo(VarName))
7106 return;
7107 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7108 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
7109 if (Entry.getVarSize() == 0) {
7110 Entry.setVarSize(VarSize);
7111 Entry.setLinkage(Linkage);
7112 }
7113 return;
7114 }
7115 Entry.setVarSize(VarSize);
7116 Entry.setLinkage(Linkage);
7117 Entry.setAddress(Addr);
7118 } else {
7119 if (hasDeviceGlobalVarEntryInfo(VarName)) {
7120 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7121 assert(Entry.isValid() && Entry.getFlags() == Flags &&
7122 "Entry not initialized!");
7123 if (Entry.getVarSize() == 0) {
7124 Entry.setVarSize(VarSize);
7125 Entry.setLinkage(Linkage);
7126 }
7127 return;
7128 }
7129 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
7130 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
7131 Addr, VarSize, Flags, Linkage,
7132 VarName.str());
7133 else
7134 OffloadEntriesDeviceGlobalVar.try_emplace(
7135 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
7136 ++OffloadingEntriesNum;
7137 }
7138}
7139
7140 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
7141 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
7142 // Scan all device global variable entries and perform the provided action.
7143 for (const auto &E : OffloadEntriesDeviceGlobalVar)
7144 Action(E.getKey(), E.getValue());
7145}
7146
7147//===----------------------------------------------------------------------===//
7148// CanonicalLoopInfo
7149//===----------------------------------------------------------------------===//
7150
7151void CanonicalLoopInfo::collectControlBlocks(
7152 SmallVectorImpl<BasicBlock *> &BBs) {
7153 // We only count those BBs as control blocks for which we do not need to
7154 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
7155 // flow. For consistency, this also means we do not add the Body block, which
7156 // is just the entry to the body code.
7157 BBs.reserve(BBs.size() + 6);
7158 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
7159}
7160
7161 BasicBlock *CanonicalLoopInfo::getPreheader() const {
7162 assert(isValid() && "Requires a valid canonical loop");
7163 for (BasicBlock *Pred : predecessors(Header)) {
7164 if (Pred != Latch)
7165 return Pred;
7166 }
7167 llvm_unreachable("Missing preheader");
7168}
7169
7170void CanonicalLoopInfo::setTripCount(Value *TripCount) {
7171 assert(isValid() && "Requires a valid canonical loop");
7172
7173 Instruction *CmpI = &getCond()->front();
7174 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
7175 CmpI->setOperand(1, TripCount);
7176
7177#ifndef NDEBUG
7178 assertOK();
7179#endif
7180}
7181
7182void CanonicalLoopInfo::mapIndVar(
7183 llvm::function_ref<Value *(Instruction *)> Updater) {
7184 assert(isValid() && "Requires a valid canonical loop");
7185
7186 Instruction *OldIV = getIndVar();
7187
7188 // Record all uses excluding those introduced by the updater. Uses by the
7189 // CanonicalLoopInfo itself to keep track of the number of iterations are
7190 // excluded.
7191 SmallVector<Use *> ReplacableUses;
7192 for (Use &U : OldIV->uses()) {
7193 auto *User = dyn_cast<Instruction>(U.getUser());
7194 if (!User)
7195 continue;
7196 if (User->getParent() == getCond())
7197 continue;
7198 if (User->getParent() == getLatch())
7199 continue;
7200 ReplacableUses.push_back(&U);
7201 }
7202
7203 // Run the updater that may introduce new uses
7204 Value *NewIV = Updater(OldIV);
7205
7206 // Replace the old uses with the value returned by the updater.
7207 for (Use *U : ReplacableUses)
7208 U->set(NewIV);
7209
7210#ifndef NDEBUG
7211 assertOK();
7212#endif
7213}
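// Editor's sketch of a typical Updater callback (Stride and Start are assumed
// values): it rewrites all body uses of the canonical induction variable to an
// affine expression, while the loop-control uses in Cond/Latch are left
// untouched, as guaranteed by the exclusion logic above.
// ```
// CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstInsertionPt());
//   Value *Scaled = Builder.CreateMul(OldIV, Stride, "iv.scaled");
//   return Builder.CreateAdd(Scaled, Start, "iv.mapped");
// });
// ```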
7214
7215 void CanonicalLoopInfo::assertOK() const {
7216#ifndef NDEBUG
7217 // No constraints if this object currently does not describe a loop.
7218 if (!isValid())
7219 return;
7220
7221 BasicBlock *Preheader = getPreheader();
7222 BasicBlock *Body = getBody();
7223 BasicBlock *After = getAfter();
7224
7225 // Verify standard control-flow we use for OpenMP loops.
7226 assert(Preheader);
7227 assert(isa<BranchInst>(Preheader->getTerminator()) &&
7228 "Preheader must terminate with unconditional branch");
7229 assert(Preheader->getSingleSuccessor() == Header &&
7230 "Preheader must jump to header");
7231
7232 assert(Header);
7233 assert(isa<BranchInst>(Header->getTerminator()) &&
7234 "Header must terminate with unconditional branch");
7235 assert(Header->getSingleSuccessor() == Cond &&
7236 "Header must jump to exiting block");
7237
7238 assert(Cond);
7239 assert(Cond->getSinglePredecessor() == Header &&
7240 "Exiting block only reachable from header");
7241
7242 assert(isa<BranchInst>(Cond->getTerminator()) &&
7243 "Exiting block must terminate with conditional branch");
7244 assert(size(successors(Cond)) == 2 &&
7245 "Exiting block must have two successors");
7246 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
7247 "Exiting block's first successor jump to the body");
7248 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
7249 "Exiting block's second successor must exit the loop");
7250
7251 assert(Body);
7252 assert(Body->getSinglePredecessor() == Cond &&
7253 "Body only reachable from exiting block");
7254 assert(!isa<PHINode>(Body->front()));
7255
7256 assert(Latch);
7257 assert(isa<BranchInst>(Latch->getTerminator()) &&
7258 "Latch must terminate with unconditional branch");
7259 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
7260 // TODO: To support simple redirecting of the end of the body code that has
7261 // multiple; introduce another auxiliary basic block like preheader and after.
7262 assert(Latch->getSinglePredecessor() != nullptr);
7263 assert(!isa<PHINode>(Latch->front()));
7264
7265 assert(Exit);
7266 assert(isa<BranchInst>(Exit->getTerminator()) &&
7267 "Exit block must terminate with unconditional branch");
7268 assert(Exit->getSingleSuccessor() == After &&
7269 "Exit block must jump to after block");
7270
7271 assert(After);
7272 assert(After->getSinglePredecessor() == Exit &&
7273 "After block only reachable from exit block");
7274 assert(After->empty() || !isa<PHINode>(After->front()));
7275
7276 Instruction *IndVar = getIndVar();
7277 assert(IndVar && "Canonical induction variable not found?");
7278 assert(isa<IntegerType>(IndVar->getType()) &&
7279 "Induction variable must be an integer");
7280 assert(cast<PHINode>(IndVar)->getParent() == Header &&
7281 "Induction variable must be a PHI in the loop header");
7282 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
7283 assert(
7284 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
7285 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
7286
7287 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
7288 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
7289 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
7290 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
7291 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
7292 ->isOne());
7293
7294 Value *TripCount = getTripCount();
7295 assert(TripCount && "Loop trip count not found?");
7296 assert(IndVar->getType() == TripCount->getType() &&
7297 "Trip count and induction variable must have the same type");
7298
7299 auto *CmpI = cast<CmpInst>(&Cond->front());
7300 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
7301 "Exit condition must be an unsigned less-than comparison");
7302 assert(CmpI->getOperand(0) == IndVar &&
7303 "Exit condition must compare the induction variable");
7304 assert(CmpI->getOperand(1) == TripCount &&
7305 "Exit condition must compare with the trip count");
7306#endif
7307}
7308
7309 void CanonicalLoopInfo::invalidate() {
7310 Header = nullptr;
7311 Cond = nullptr;
7312 Latch = nullptr;
7313 Exit = nullptr;
7314}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Rewrite undef for PHI
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:528
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
IntegerType * Int32Ty
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input, Function *Func)
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr, Function *Func)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
Value * createFakeIntVal(IRBuilder<> &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, std::stack< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
Function * getFreshReductionFunc(Module &M)
Create a function with a unique name and a "void (i8*, i8*)" signature in the given module and return...
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:76
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:59
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:107
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:125
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:112
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:136
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:103
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:535
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:797
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:782
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:657
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:409
reverse_iterator rbegin()
Definition: BasicBlock.h:446
bool empty() const
Definition: BasicBlock.h:452
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const Instruction & front() const
Definition: BasicBlock.h:453
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:490
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:460
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:482
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:276
reverse_iterator rend()
Definition: BasicBlock.h:448
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:379
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:358
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:613
const Instruction & back() const
Definition: BasicBlock.h:455
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:289
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:509
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1662
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1668
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2881
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:705
A constant value that is initialized with an expression using other constant values.
Definition: Constants.h:1017
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2072
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2087
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2152
Instruction * getAsInstruction() const
Returns an Instruction which implements the same operation as this ConstantExpr.
Definition: Constants.cpp:3310
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:123
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1775
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1356
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:294
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:276
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:750
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:585
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:164
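As a usage sketch (names hypothetical), FunctionType::get and Function::Create combine to declare a new function in a module:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  Function *declareCallback(Module &M) {
    LLVMContext &Ctx = M.getContext();
    // Signature: void callback(i32), not vararg.
    FunctionType *FTy = FunctionType::get(
        Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, /*isVarArg=*/false);
    Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
                                   /*AddrSpace=*/0, "callback", &M);
    F->addFnAttr(Attribute::NoUnwind);
    return F;
  }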
const BasicBlock & getEntryBlock() const
Definition: Function.h:787
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:399
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:713
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
const Function & getFunction() const
Definition: Function.h:162
iterator begin()
Definition: Function.h:803
arg_iterator arg_begin()
Definition: Function.h:818
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:613
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:732
size_t arg_size() const
Definition: Function.h:851
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:207
iterator end()
Definition: Function.h:805
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:268
Argument * getArg(unsigned i) const
Definition: Function.h:836
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1521
LinkageTypes getLinkage() const
Definition: GlobalValue.h:545
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:536
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
void setDSOLocal(bool Local)
Definition: GlobalValue.h:302
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:293
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:67
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:68
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:253
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:50
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:59
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:61
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:58
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:56
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:51
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:55
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:54
Type * getValueType() const
Definition: GlobalValue.h:295
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:257
BasicBlock * getBlock() const
Definition: IRBuilder.h:272
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:270
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:273
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1841
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1773
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:1993
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2039
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1263
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1307
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1973
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2033
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:531
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1876
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2182
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1378
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2245
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:277
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1143
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2241
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:63
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:497
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
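A small sketch of the compare-and-branch pattern built from the calls above; the blocks and the builder's position are assumed to exist already:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  void emitGuard(IRBuilder<> &Builder, Value *X, BasicBlock *ThenBB,
                 BasicBlock *ElseBB) {
    // Branch to ThenBB when X == 0, otherwise to ElseBB.
    Value *IsZero = Builder.CreateICmpEQ(X, Builder.getInt32(0), "is.zero");
    Builder.CreateCondBr(IsZero, ThenBB, ElseBB);
  }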
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1475
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1090
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1914
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1960
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2549
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
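A hedged sketch of the two atomic builders above, assuming Ptr points at an i32 and Val is an i32:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  void emitAtomics(IRBuilder<> &Builder, Value *Ptr, Value *Val) {
    // Atomic fetch-add, sequentially consistent.
    Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, MaybeAlign(),
                            AtomicOrdering::SequentiallyConsistent);
    // Compare-exchange: if *Ptr == Val, store Val + 1.
    Value *NewVal = Builder.CreateAdd(Val, Builder.getInt32(1));
    Builder.CreateAtomicCmpXchg(Ptr, Val, NewVal, MaybeAlign(),
                                AtomicOrdering::SequentiallyConsistent,
                                AtomicOrdering::SequentiallyConsistent);
  }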
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2253
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:289
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2544
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:564
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1519
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:659
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2054
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
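To show how the memory and arithmetic builders compose, a minimal stack-slot round trip (illustrative only):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  Value *emitIncrement(IRBuilder<> &Builder) {
    AllocaInst *Slot = Builder.CreateAlloca(Builder.getInt32Ty(),
                                            /*AddrSpace=*/0, nullptr, "slot");
    Builder.CreateStore(Builder.getInt32(41), Slot);           // slot = 41
    Value *V = Builder.CreateLoad(Builder.getInt32Ty(), Slot, "v");
    return Builder.CreateAdd(V, Builder.getInt32(1), "v.inc"); // v + 1
  }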
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:91
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:926
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:957
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:117
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
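A short sketch tying MDString, MDTuple, and Instruction::setMetadata together; the kind name "example.tag" is made up for illustration:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Metadata.h"
  using namespace llvm;

  void tagInstruction(Instruction &I) {
    LLVMContext &Ctx = I.getContext();
    // Attach !{!"example.tag"} under a custom (hypothetical) metadata kind.
    MDNode *Node = MDTuple::get(Ctx, {MDString::get(Ctx, "example.tag")});
    I.setMetadata(Ctx.getMDKindID("example.tag"), Node);
  }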
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
NamedMDNode * getNamedMetadata(const Twine &Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:260
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:284
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:297
iterator_range< global_iterator > globals()
Definition: Module.h:699
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:611
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:446
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:133
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:269
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:461
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
A tuple of MDNodes.
Definition: Metadata.h:1729
iterator_range< op_iterator > operands()
Definition: Metadata.h:1825
void addOperand(MDNode *M)
Definition: Metadata.cpp:1387
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:221
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:223
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:354
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:356
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:274
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:276
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:265
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:334
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:340
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:346
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:344
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target 'link' entry.
Definition: OMPIRBuilder.h:338
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target 'to' entry.
Definition: OMPIRBuilder.h:336
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:410
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:90
StringRef separator() const
Definition: OMPIRBuilder.h:157
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:147
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:129
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:451
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:497
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function, sets up the attributes of the function, and returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic capture for constructs (scalar data types only): V = X; X = X BinOp Expr, ...
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
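A hedged sketch of driving createCanonicalLoop, assuming OMPBuilder has already been initialize()d and Builder is positioned at valid host code:

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  using namespace llvm;

  void buildLoop(OpenMPIRBuilder &OMPBuilder, IRBuilder<> &Builder,
                 Value *TripCount) {
    auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP, Value *IV) {
      Builder.restoreIP(CodeGenIP);
      // Emit the loop body here; IV is the canonical induction variable.
    };
    OpenMPIRBuilder::LocationDescription Loc(
        Builder.saveIP(), Builder.getCurrentDebugLocation());
    CanonicalLoopInfo *CLI =
        OMPBuilder.createCanonicalLoop(Loc, BodyGen, TripCount);
    (void)CLI; // CLI can later be handed to the workshare/tile/unroll helpers.
  }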
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
InsertPointTy emitBarrierImpl(const LocationDescription &Loc, omp::Directive DK, bool ForceSimpleCall, bool CheckCancelFlag)
Generate a barrier runtime call.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
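As a usage sketch (assuming a host module M and an IRBuilder positioned where the barrier belongs), the typical lifecycle is initialize(), emit directives, then finalize():

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  using namespace llvm;

  void emitBarrierAt(Module &M, IRBuilder<> &Builder) {
    OpenMPIRBuilder OMPBuilder(M);
    OMPBuilder.initialize();
    OpenMPIRBuilder::LocationDescription Loc(
        Builder.saveIP(), Builder.getCurrentDebugLocation());
    OMPBuilder.createBarrier(Loc, omp::OMPD_barrier);
    OMPBuilder.finalize(); // outline any pending regions
  }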
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool EmitDebug=false, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for #omp task
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsByRef=false)
Generator for '#omp reduction'.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (scalar data types only): cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder<>::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:477
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates an offloading entry for the provided entry ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
InsertPointTy createTarget(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB)
Generator for '#omp target'.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Definition: PassManager.h:296
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
void setAlignment(Align Align)
Definition: Instructions.h:373
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:400
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:692
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:443
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:608
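A minimal sketch of the StringRef helpers above (purely illustrative):

  #include "llvm/ADT/StringRef.h"
  using namespace llvm;

  void inspectName(StringRef Name) {
    auto [Head, Rest] = Name.split('.'); // "foo.bar" -> ("foo", "bar")
    if (Name.ends_with(".tmp"))
      Name = Name.drop_back(4);          // strip the ".tmp" suffix
    size_t Dots = Name.count('.');       // number of '.' characters
    (void)Head; (void)Rest; (void)Dots;
  }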
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
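A small sketch of building a multiway branch with CreateSwitch and addCase; the destination blocks are assumed to exist:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  void emitDispatch(IRBuilder<> &Builder, Value *Selector, BasicBlock *Default,
                    BasicBlock *Case0, BasicBlock *Case1) {
    SwitchInst *SI = Builder.CreateSwitch(Selector, Default, /*NumCases=*/2);
    SI->addCase(Builder.getInt32(0), Case0);
    SI->addCase(Builder.getInt32(1), Case1);
  }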
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:953
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1011
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1021
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:125
bool canUnroll() const
Whether it is legal to unroll this loop.
Definition: UnrollLoop.h:138
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:140
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:926
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:316
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:64
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
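A hedged sketch of the range helpers above, using structured bindings:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/Debug.h"
  using namespace llvm;

  void walk(const SmallVector<int, 4> &A, const SmallVector<int, 4> &B) {
    for (auto [Idx, V] : enumerate(A))  // (index, element) pairs
      dbgs() << Idx << ": " << V << "\n";
    for (auto [X, Y] : zip(A, B))       // stops at the shorter range
      dbgs() << X + Y << "\n";
    if (all_of(A, [](int V) { return V > 0; }))
      dbgs() << "all positive\n";
  }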
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular function.
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:832
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
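A hedged sketch (makeDiamond is a made-up name; Cond is assumed to be an i1 value that dominates Pt):

  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"

  // Split the block before Pt into an if/then/else diamond on Cond.
  void makeDiamond(llvm::Value *Cond, llvm::Instruction *Pt) {
    llvm::Instruction *ThenTerm = nullptr;
    llvm::Instruction *ElseTerm = nullptr;
    llvm::SplitBlockAndInsertIfThenElse(Cond, Pt->getIterator(), &ThenTerm,
                                        &ElseTerm);
    // ThenTerm/ElseTerm are the unconditional branches terminating the new
    // then/else blocks; emit side-specific code before them.
  }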
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
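A minimal sketch using the InsertPoint overload shown above (splitHere and the block name are invented):

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  #include "llvm/IR/IRBuilder.h"

  // Split at the builder's current position; with CreateBranch=true the old
  // block ends in an unconditional branch to the returned block.
  llvm::BasicBlock *splitHere(llvm::IRBuilderBase &Builder) {
    return llvm::splitBB(Builder.saveIP(), /*CreateBranch=*/true, "split.cont");
  }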
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
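For illustration (tryMerge is invented); the call returns false and leaves the IR unchanged when the merge is illegal, e.g. when BB has multiple predecessors:

  #include "llvm/Transforms/Utils/BasicBlockUtils.h"

  // Fold BB into its sole predecessor when the CFG allows it.
  bool tryMerge(llvm::BasicBlock *BB) {
    return llvm::MergeBlockIntoPredecessor(BB);
  }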
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from their containing function.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:593
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
AtomicReductionGenTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenTy ReductionGen
Callback for generating the reduction body.
Type * ElementType
Reduction element type; must match the pointee type of the variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * NumTeams
The number of teams.
Value * DynCGGroupMem
The size of the dynamic shared memory.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has the 'nowait' clause.
Value * NumThreads
The number of threads.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:183
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.
Definition: OMPGridValues.h:57