//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternately creating instructions will cause the instructions to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
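
// For illustration: under the mapping above, `schedule(static, 4)` yields
// BaseStaticChunked, plain `schedule(static)` yields BaseStatic, and
// `schedule(guided)` on a loop with a `simd` modifier yields BaseGuidedSimd.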

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations: the simd-specific schedules have no ordered
  // variant, so fall back to the corresponding non-simd ordered schedule.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic modifiers contradict each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
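
// For example, a plain `schedule(dynamic)` loop reaches this point without
// explicit modifiers and picks up ModifierNonmonotonic, while static
// schedules or loops with an `ordered` clause are returned unchanged and run
// with the runtime's default monotonic behavior.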

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Unexpected schedule type");
  return Result;
}
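
// Worked example: `#pragma omp for schedule(dynamic) ordered` composes as
// BaseDynamicChunked, then OrderedDynamicChunked, and the ordered clause
// keeps the runtime-default monotonic behavior, so no monotonicity bit is
// added to the final schedule type.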

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///       the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
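
// Illustrative use of these helpers (sketch; block name hypothetical): carve
// the current emission point into a fresh block and continue emitting there.
//   BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "body");
//   Builder.SetInsertPoint(BodyBB, BodyBB->begin());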

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
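
// A minimal configuration sketch (hypothetical host setup recording
// `#pragma omp requires unified_shared_memory`):
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/true,
//                                /*HasRequiresDynamicAllocators=*/false);
//   OMPBuilder.setConfig(Config);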

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
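
// Illustrative call site (sketch; `OMPBuilder` is assumed to be an
// initialized OpenMPIRBuilder for module `M`):
//   FunctionCallee Barrier =
//       OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier);
// Subsequent requests for the same function return the existing declaration.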

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}
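
// For example (sketch of the effect on IR): an `%x = alloca i32, align 4`
// emitted inside a loop body block is moved up to the function entry block,
// while a runtime-sized `%vla = alloca i32, i32 %n` stays put because its
// array size is not ConstantData.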

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry block
  // of our target, or we risk malformed optimisations by later passes. This
  // is only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly
  // adversely affected by any raises unless intentionally appended to the
  // list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort. However, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads; otherwise it will yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
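
// The emitted global looks roughly like this (illustrative IR; flag value 2
// corresponds to OMP_IDENT_FLAG_KMPC):
//   @ident = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 22, ptr @.src_loc_str }, align 8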

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
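
// For example, a call site in function `foo` at example.c:4:1 is encoded as
// the string ";example.c;foo;4;1;;", matching the unknown-location default
// ";unknown;unknown;0;0;;" used below.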

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
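
// The emitted IR is roughly (illustrative):
//   %tid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %tid)
// or, inside a cancellable parallel region, a call to
// i32 @__kmpc_cancel_barrier whose result feeds the cancellation check.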

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
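
// The resulting device-side call is roughly (illustrative):
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 1 /* if */,
//                                 i32 -1 /* num_threads */,
//                                 i32 -1 /* proc_bind */, ptr @outlined,
//                                 ptr null /* wrapper */, ptr %args,
//                                 i64 %nargs)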

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
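
// On the host the lowering produces roughly (illustrative):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 %ncaptured, ptr @outer..omp_par, ...captured args...)
// where the runtime forks a team and invokes the outlined microtask once per
// thread.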

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space.
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate an OpenMP target-specific runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate an OpenMP host runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");

  FunctionCallee TIDRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);

  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return;
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto stack and load it back inside the to-be-outlined region. This
    // will ensure only the pointer will be passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to stack at end of the block that currently branches to the
      // entry block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to allocations in the to-be-outlined region.
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      Builder.restoreIP(
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      InnerAllocaIP = {
          InnerAllocaIP.getBlock(),
          InnerAllocaIP.getBlock()->getTerminator()->getIterator()};

      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return;
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  };
1604
1605 // Reset the inner alloca insertion as it will be used for loading the values
1606 // wrapped into pointers before passing them into the to-be-outlined region.
1607 // Configure it to insert immediately after the fake use of zero address so
1608 // that they are available in the generated body and so that the
1609 // OpenMP-related values (thread ID and zero address pointers) remain leading
1610 // in the argument list.
1611 InnerAllocaIP = IRBuilder<>::InsertPoint(
1612 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1613
1614 // Reset the outer alloca insertion point to the entry of the relevant block
1615 // in case it was invalidated.
1616 OuterAllocaIP = IRBuilder<>::InsertPoint(
1617 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1618
1619 for (Value *Input : Inputs) {
1620 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1621 PrivHelper(*Input);
1622 }
1623 LLVM_DEBUG({
1624 for (Value *Output : Outputs)
1625 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1626 });
1627 assert(Outputs.empty() &&
1628 "OpenMP outlining should not produce live-out values!");
1629
1630 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1631 LLVM_DEBUG({
1632 for (auto *BB : Blocks)
1633 dbgs() << " PBR: " << BB->getName() << "\n";
1634 });
1635
1636 // Adjust the finalization stack, verify the adjustment, and call the
1637 // finalize function one last time to finalize values between the pre-fini
1638 // block and the exit block if we left the parallel region "the normal way".
1639 auto FiniInfo = FinalizationStack.pop_back_val();
1640 (void)FiniInfo;
1641 assert(FiniInfo.DK == OMPD_parallel &&
1642 "Unexpected finalization stack state!");
1643
1644 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1645
1646 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1647 FiniCB(PreFiniIP);
1648
1649 // Register the outlined info.
1650 addOutlineInfo(std::move(OI));
1651
1652 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1653 UI->eraseFromParent();
1654
1655 return AfterIP;
1656}
1657
1658void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1659 // Build call void __kmpc_flush(ident_t *loc)
1660 uint32_t SrcLocStrSize;
1661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1662 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1663
1664 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1665}
1666
1667void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1668 if (!updateToLocation(Loc))
1669 return;
1670 emitFlush(Loc);
1671}
1672
1673void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1674 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1675 // global_tid);
1676 uint32_t SrcLocStrSize;
1677 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1678 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1679 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1680
1681 // Ignore return result until untied tasks are supported.
1682 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1683 Args);
1684}
1685
1686void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1687 if (!updateToLocation(Loc))
1688 return;
1689 emitTaskwaitImpl(Loc);
1690}
1691
1692void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1693 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1694 uint32_t SrcLocStrSize;
1695 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1696 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1697 Constant *I32Null = ConstantInt::getNullValue(Int32);
1698 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1699
1700 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1701 Args);
1702}
1703
1704void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1705 if (!updateToLocation(Loc))
1706 return;
1707 emitTaskyieldImpl(Loc);
1708}
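// The three thin wrappers above each lower to a single runtime call. As a
// rough sketch (assuming an ident_t global @loc and a previously obtained
// thread id %gtid; exact names are illustrative only):
// \code
//   call void @__kmpc_flush(ptr @loc)
//   %1 = call i32 @__kmpc_omp_taskwait(ptr @loc, i32 %gtid)
//   %2 = call i32 @__kmpc_omp_taskyield(ptr @loc, i32 %gtid, i32 0)
// \endcode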
1709
1710// Processes the dependencies in Dependencies and does the following:
1711// - Allocates space on the stack for an array of DependInfo objects.
1712// - Populates each DependInfo object with relevant information about
1713// the corresponding dependence.
1714// - All code is inserted in the entry block of the current function.
1715static Value *emitTaskDependencies(
1716 OpenMPIRBuilder &OMPBuilder,
1717 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1718 // Early return if we have no dependencies to process
1719 if (Dependencies.empty())
1720 return nullptr;
1721
1722 // Given a vector of DependData objects, in this function we create an
1723 // array on the stack that holds kmp_depend_info objects corresponding
1724 // to each dependency. This is then passed to the OpenMP runtime.
1725 // For example, if there are 'n' dependencies then the following pseudo
1726 // code is generated. Assume the first dependence is on a variable 'a'.
1727 //
1728 // \code{c}
1729 // DepArray = alloc(n x sizeof(kmp_depend_info));
1730 // idx = 0;
1731 // DepArray[idx].base_addr = ptrtoint(&a);
1732 // DepArray[idx].len = 8;
1733 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1734 // ++idx;
1735 // DepArray[idx].base_addr = ...;
1736 // \endcode
1737
1738 IRBuilderBase &Builder = OMPBuilder.Builder;
1739 Type *DependInfo = OMPBuilder.DependInfo;
1740 Module &M = OMPBuilder.M;
1741
1742 Value *DepArray = nullptr;
1743 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1744 Builder.SetInsertPoint(
1745 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1746
1747 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1748 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1749
1750 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1751 Value *Base =
1752 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1753 // Store the pointer to the variable
1754 Value *Addr = Builder.CreateStructGEP(
1755 DependInfo, Base,
1756 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1757 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1758 Builder.CreateStore(DepValPtr, Addr);
1759 // Store the size of the variable
1760 Value *Size = Builder.CreateStructGEP(
1761 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1762 Builder.CreateStore(
1763 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1764 Size);
1765 // Store the dependency kind
1766 Value *Flags = Builder.CreateStructGEP(
1767 DependInfo, Base,
1768 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1769 Builder.CreateStore(
1770 ConstantInt::get(Builder.getInt8Ty(),
1771 static_cast<unsigned int>(Dep.DepKind)),
1772 Flags);
1773 }
1774 Builder.restoreIP(OldIP);
1775 return DepArray;
1776}
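// For orientation, the kmp_depend_info record populated above corresponds
// roughly to the following C layout (a sketch; the authoritative definition
// lives in the OpenMP runtime, and the field indices come from
// RTLDependInfoFields in OMPConstants.h):
// \code{c}
//   typedef struct kmp_depend_info {
//     intptr_t base_addr;  /* RTLDependInfoFields::BaseAddr */
//     size_t len;          /* RTLDependInfoFields::Len */
//     unsigned char flags; /* RTLDependInfoFields::Flags, holds DepKind */
//   } kmp_depend_info_t;
// \endcode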
1777
1778OpenMPIRBuilder::InsertPointTy
1779OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1780 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1781 bool Tied, Value *Final, Value *IfCondition,
1782 SmallVector<DependData> Dependencies) {
1783
1784 if (!updateToLocation(Loc))
1785 return InsertPointTy();
1786
1787 uint32_t SrcLocStrSize;
1788 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1789 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1790 // The current basic block is split into four basic blocks. After outlining,
1791 // they will be mapped as follows:
1792 // ```
1793 // def current_fn() {
1794 // current_basic_block:
1795 // br label %task.exit
1796 // task.exit:
1797 // ; instructions after task
1798 // }
1799 // def outlined_fn() {
1800 // task.alloca:
1801 // br label %task.body
1802 // task.body:
1803 // ret void
1804 // }
1805 // ```
1806 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1807 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1808 BasicBlock *TaskAllocaBB =
1809 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1810
1811 InsertPointTy TaskAllocaIP =
1812 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1813 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1814 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1815
1816 OutlineInfo OI;
1817 OI.EntryBB = TaskAllocaBB;
1818 OI.OuterAllocaBB = AllocaIP.getBlock();
1819 OI.ExitBB = TaskExitBB;
1820
1821 // Add the thread ID argument.
1822 SmallVector<Instruction *, 4> ToBeDeleted;
1823 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1824 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1825
1826 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1827 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1828 // Replace the Stale CI by appropriate RTL function call.
1829 assert(OutlinedFn.getNumUses() == 1 &&
1830 "there must be a single user for the outlined function");
1831 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1832
1833 // HasShareds is true if any variables are captured in the outlined region,
1834 // false otherwise.
1835 bool HasShareds = StaleCI->arg_size() > 1;
1836 Builder.SetInsertPoint(StaleCI);
1837
1838 // Gather the arguments for emitting the runtime call for
1839 // @__kmpc_omp_task_alloc
1840 Function *TaskAllocFn =
1841 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1842
1843 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
1844 // call.
1845 Value *ThreadID = getOrCreateThreadID(Ident);
1846
1847 // Argument - `flags`
1848 // Task is tied iff (Flags & 1) == 1.
1849 // Task is untied iff (Flags & 1) == 0.
1850 // Task is final iff (Flags & 2) == 2.
1851 // Task is not final iff (Flags & 2) == 0.
1852 // TODO: Handle the other flags.
1853 Value *Flags = Builder.getInt32(Tied);
1854 if (Final) {
1855 Value *FinalFlag =
1856 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1857 Flags = Builder.CreateOr(FinalFlag, Flags);
1858 }
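// Worked example of the encoding above (illustrative): a plain tied task
// yields Flags == 1; `task final(c)` on a tied task yields
// Flags == (c ? 2 : 0) | 1, i.e. 3 when the final clause evaluates to true.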
1859
1860 // Argument - `sizeof_kmp_task_t` (TaskSize)
1861 // Tasksize refers to the size in bytes of kmp_task_t data structure
1862 // including private vars accessed in task.
1863 // TODO: add kmp_task_t_with_privates (privates)
1864 Value *TaskSize = Builder.getInt64(
1865 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1866
1867 // Argument - `sizeof_shareds` (SharedsSize)
1868 // SharedsSize refers to the shareds array size in the kmp_task_t data
1869 // structure.
1870 Value *SharedsSize = Builder.getInt64(0);
1871 if (HasShareds) {
1872 AllocaInst *ArgStructAlloca =
1873 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1874 assert(ArgStructAlloca &&
1875 "Unable to find the alloca instruction corresponding to arguments "
1876 "for extracted function");
1877 StructType *ArgStructType =
1878 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1879 assert(ArgStructType && "Unable to find struct type corresponding to "
1880 "arguments for extracted function");
1881 SharedsSize =
1882 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1883 }
1884 // Emit the @__kmpc_omp_task_alloc runtime call
1885 // The runtime call returns a pointer to an area where the task captured
1886 // variables must be copied before the task is run (TaskData)
1887 CallInst *TaskData = Builder.CreateCall(
1888 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1889 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1890 /*task_func=*/&OutlinedFn});
1891
1892 // Copy the arguments for outlined function
1893 if (HasShareds) {
1894 Value *Shareds = StaleCI->getArgOperand(1);
1895 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1896 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1897 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1898 SharedsSize);
1899 }
1900
1901 Value *DepArray = nullptr;
1902 if (Dependencies.size()) {
1903 InsertPointTy OldIP = Builder.saveIP();
1904 Builder.SetInsertPoint(
1905 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1906
1907 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1908 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1909
1910 unsigned P = 0;
1911 for (const DependData &Dep : Dependencies) {
1912 Value *Base =
1913 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1914 // Store the pointer to the variable
1915 Value *Addr = Builder.CreateStructGEP(
1916 DependInfo, Base,
1917 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1918 Value *DepValPtr =
1919 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1920 Builder.CreateStore(DepValPtr, Addr);
1921 // Store the size of the variable
1922 Value *Size = Builder.CreateStructGEP(
1923 DependInfo, Base,
1924 static_cast<unsigned int>(RTLDependInfoFields::Len));
1925 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1926 Dep.DepValueType)),
1927 Size);
1928 // Store the dependency kind
1929 Value *Flags = Builder.CreateStructGEP(
1930 DependInfo, Base,
1931 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1932 Builder.CreateStore(
1933 ConstantInt::get(Builder.getInt8Ty(),
1934 static_cast<unsigned int>(Dep.DepKind)),
1935 Flags);
1936 ++P;
1937 }
1938
1939 Builder.restoreIP(OldIP);
1940 }
1941
1942 // In the presence of the `if` clause, the following IR is generated:
1943 // ...
1944 // %data = call @__kmpc_omp_task_alloc(...)
1945 // br i1 %if_condition, label %then, label %else
1946 // then:
1947 // call @__kmpc_omp_task(...)
1948 // br label %exit
1949 // else:
1950 // ;; Wait for resolution of dependencies, if any, before
1951 // ;; beginning the task
1952 // call @__kmpc_omp_wait_deps(...)
1953 // call @__kmpc_omp_task_begin_if0(...)
1954 // call @outlined_fn(...)
1955 // call @__kmpc_omp_task_complete_if0(...)
1956 // br label %exit
1957 // exit:
1958 // ...
1959 if (IfCondition) {
1960 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1961 // terminator.
1962 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1963 Instruction *IfTerminator =
1964 Builder.GetInsertPoint()->getParent()->getTerminator();
1965 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1966 Builder.SetInsertPoint(IfTerminator);
1967 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1968 &ElseTI);
1969 Builder.SetInsertPoint(ElseTI);
1970
1971 if (Dependencies.size()) {
1972 Function *TaskWaitFn =
1973 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1974 Builder.CreateCall(
1975 TaskWaitFn,
1976 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1977 ConstantInt::get(Builder.getInt32Ty(), 0),
1978 ConstantPointerNull::get(Builder.getPtrTy())});
1979 }
1980 Function *TaskBeginFn =
1981 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1982 Function *TaskCompleteFn =
1983 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1984 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1985 CallInst *CI = nullptr;
1986 if (HasShareds)
1987 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1988 else
1989 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1990 CI->setDebugLoc(StaleCI->getDebugLoc());
1991 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1992 Builder.SetInsertPoint(ThenTI);
1993 }
1994
1995 if (Dependencies.size()) {
1996 Function *TaskFn =
1997 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1998 Builder.CreateCall(
1999 TaskFn,
2000 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2001 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2002 ConstantPointerNull::get(Builder.getPtrTy())});
2003
2004 } else {
2005 // Emit the @__kmpc_omp_task runtime call to spawn the task
2006 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2007 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2008 }
2009
2010 StaleCI->eraseFromParent();
2011
2012 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2013 if (HasShareds) {
2014 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2015 OutlinedFn.getArg(1)->replaceUsesWithIf(
2016 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2017 }
2018
2019 llvm::for_each(llvm::reverse(ToBeDeleted),
2020 [](Instruction *I) { I->eraseFromParent(); });
2021 };
2022
2023 addOutlineInfo(std::move(OI));
2024 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2025
2026 return Builder.saveIP();
2027}
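// Taken together, the PostOutlineCB above rewrites the stale outlined call
// into roughly the following IR (a sketch for a task with shareds and
// neither an `if` clause nor dependencies; all names are illustrative):
// \code
//   %task = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid, i32 %flags,
//                                           i64 %task_size, i64 %shareds_size,
//                                           ptr @outlined_fn)
//   %shareds = load ptr, ptr %task
//   call void @llvm.memcpy.p0.p0.i64(ptr %shareds, ptr %captured,
//                                    i64 %shareds_size, i1 false)
//   %1 = call i32 @__kmpc_omp_task(ptr @loc, i32 %gtid, ptr %task)
// \endcode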
2028
2029OpenMPIRBuilder::InsertPointTy
2030OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2031 InsertPointTy AllocaIP,
2032 BodyGenCallbackTy BodyGenCB) {
2033 if (!updateToLocation(Loc))
2034 return InsertPointTy();
2035
2036 uint32_t SrcLocStrSize;
2037 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2038 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2039 Value *ThreadID = getOrCreateThreadID(Ident);
2040
2041 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2042 Function *TaskgroupFn =
2043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2044 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2045
2046 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2047 BodyGenCB(AllocaIP, Builder.saveIP());
2048
2049 Builder.SetInsertPoint(TaskgroupExitBB);
2050 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2051 Function *EndTaskgroupFn =
2052 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2053 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2054
2055 return Builder.saveIP();
2056}
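// The taskgroup construct thus brackets the user body with two runtime
// calls, roughly (a sketch; names are illustrative):
// \code
//   call void @__kmpc_taskgroup(ptr @loc, i32 %gtid)
//   ; ... body, including any tasks spawned inside it ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %gtid)
// \endcode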
2057
2058OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
2059 const LocationDescription &Loc, InsertPointTy AllocaIP,
2060 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2061 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2062 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2063
2064 if (!updateToLocation(Loc))
2065 return Loc.IP;
2066
2067 auto FiniCBWrapper = [&](InsertPointTy IP) {
2068 if (IP.getBlock()->end() != IP.getPoint())
2069 return FiniCB(IP);
2070 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2071 // will fail because that function requires the Finalization Basic Block to
2072 // have a terminator, which is already removed by EmitOMPRegionBody.
2073 // IP is currently at the cancellation block.
2074 // We need to backtrack to the condition block to fetch
2075 // the exit block and create a branch from the cancellation
2076 // to the exit block.
2078 Builder.restoreIP(IP);
2079 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2080 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2081 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2082 Instruction *I = Builder.CreateBr(ExitBB);
2083 IP = InsertPointTy(I->getParent(), I->getIterator());
2084 return FiniCB(IP);
2085 };
2086
2087 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2088
2089 // Each section is emitted as a switch case
2090 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2091 // -> OMP.createSection() which generates the IR for each section
2092 // Iterate through all sections and emit a switch construct:
2093 // switch (IV) {
2094 // case 0:
2095 // <SectionStmt[0]>;
2096 // break;
2097 // ...
2098 // case <NumSection> - 1:
2099 // <SectionStmt[<NumSection> - 1]>;
2100 // break;
2101 // }
2102 // ...
2103 // section_loop.after:
2104 // <FiniCB>;
2105 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2106 Builder.restoreIP(CodeGenIP);
2107 BasicBlock *Continue =
2108 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2109 Function *CurFn = Continue->getParent();
2110 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2111
2112 unsigned CaseNumber = 0;
2113 for (auto SectionCB : SectionCBs) {
2114 BasicBlock *CaseBB = BasicBlock::Create(
2115 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2116 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2117 Builder.SetInsertPoint(CaseBB);
2118 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2119 SectionCB(InsertPointTy(),
2120 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2121 CaseNumber++;
2122 }
2123 // remove the existing terminator from body BB since there can be no
2124 // terminators after switch/case
2125 };
2126 // Loop body ends here
2127 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2128 Type *I32Ty = Type::getInt32Ty(M.getContext());
2129 Value *LB = ConstantInt::get(I32Ty, 0);
2130 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2131 Value *ST = ConstantInt::get(I32Ty, 1);
2132 CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2133 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2134 InsertPointTy AfterIP =
2135 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2136
2137 // Apply the finalization callback in LoopAfterBB
2138 auto FiniInfo = FinalizationStack.pop_back_val();
2139 assert(FiniInfo.DK == OMPD_sections &&
2140 "Unexpected finalization stack state!");
2141 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2142 Builder.restoreIP(AfterIP);
2143 BasicBlock *FiniBB =
2144 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2145 CB(Builder.saveIP());
2146 AfterIP = {FiniBB, FiniBB->begin()};
2147 }
2148
2149 return AfterIP;
2150}
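// In other words, createSections lowers the construct onto a statically
// scheduled workshare loop over section indices; conceptually (a sketch):
// \code{c}
//   for (int iv = lb; iv <= ub; ++iv) { /* bounds via __kmpc_for_static_init */
//     switch (iv) {
//     case 0: /* SectionStmt[0] */ break;
//     /* ... */
//     }
//   }
// \endcode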
2151
2152OpenMPIRBuilder::InsertPointTy
2153OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2154 BodyGenCallbackTy BodyGenCB,
2155 FinalizeCallbackTy FiniCB) {
2156 if (!updateToLocation(Loc))
2157 return Loc.IP;
2158
2159 auto FiniCBWrapper = [&](InsertPointTy IP) {
2160 if (IP.getBlock()->end() != IP.getPoint())
2161 return FiniCB(IP);
2162 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2163 // will fail because that function requires the Finalization Basic Block to
2164 // have a terminator, which is already removed by EmitOMPRegionBody.
2165 // IP is currently at the cancellation block.
2166 // We need to backtrack to the condition block to fetch
2167 // the exit block and create a branch from the cancellation
2168 // to the exit block.
2170 Builder.restoreIP(IP);
2171 auto *CaseBB = Loc.IP.getBlock();
2172 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2173 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2174 Instruction *I = Builder.CreateBr(ExitBB);
2175 IP = InsertPointTy(I->getParent(), I->getIterator());
2176 return FiniCB(IP);
2177 };
2178
2179 Directive OMPD = Directive::OMPD_sections;
2180 // Since we are using Finalization Callback here, HasFinalize
2181 // and IsCancellable have to be true
2182 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2183 /*Conditional*/ false, /*hasFinalize*/ true,
2184 /*IsCancellable*/ true);
2185}
2186
2187static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2188 BasicBlock::iterator IT(I);
2189 IT++;
2190 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2191}
2192
2193void OpenMPIRBuilder::emitUsed(StringRef Name,
2194 std::vector<WeakTrackingVH> &List) {
2195 if (List.empty())
2196 return;
2197
2198 // Convert List to what ConstantArray needs.
2199 SmallVector<Constant *, 8> UsedArray;
2200 UsedArray.resize(List.size());
2201 for (unsigned I = 0, E = List.size(); I != E; ++I)
2202 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2203 cast<Constant>(&*List[I]), Builder.getPtrTy());
2204
2205 if (UsedArray.empty())
2206 return;
2207 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2208
2209 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2210 ConstantArray::get(ATy, UsedArray), Name);
2211
2212 GV->setSection("llvm.metadata");
2213}
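// For example, emitUsed("llvm.compiler.used", List) with a single entry would
// produce a global along these lines (a sketch; @entry is a placeholder):
// \code
//   @llvm.compiler.used = appending global [1 x ptr] [ptr @entry],
//                         section "llvm.metadata"
// \endcode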
2214
2215Value *OpenMPIRBuilder::getGPUThreadID() {
2216 return Builder.CreateCall(
2217 getOrCreateRuntimeFunction(M,
2218 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2219 {});
2220}
2221
2222Value *OpenMPIRBuilder::getGPUWarpSize() {
2223 return Builder.CreateCall(
2224 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2225}
2226
2227Value *OpenMPIRBuilder::getNVPTXWarpID() {
2228 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2229 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2230}
2231
2232Value *OpenMPIRBuilder::getNVPTXLaneID() {
2233 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2234 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2235 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2236 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2237 "nvptx_lane_id");
2238}
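// Worked example for the two helpers above, assuming GV_Warp_Size == 32
// (LaneIDBits == 5): for GPU thread id 70, the warp id is 70 >> 5 == 2 and
// the lane id is 70 & 31 == 6.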
2239
2240Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2241 Type *ToType) {
2242 Type *FromType = From->getType();
2243 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2244 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2245 assert(FromSize > 0 && "From size must be greater than zero");
2246 assert(ToSize > 0 && "To size must be greater than zero");
2247 if (FromType == ToType)
2248 return From;
2249 if (FromSize == ToSize)
2250 return Builder.CreateBitCast(From, ToType);
2251 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2252 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2253 InsertPointTy SaveIP = Builder.saveIP();
2254 Builder.restoreIP(AllocaIP);
2255 Value *CastItem = Builder.CreateAlloca(ToType);
2256 Builder.restoreIP(SaveIP);
2257
2258 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2259 CastItem, FromType->getPointerTo());
2260 Builder.CreateStore(From, ValCastItem);
2261 return Builder.CreateLoad(ToType, CastItem);
2262}
2263
2264Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2265 Value *Element,
2266 Type *ElementType,
2267 Value *Offset) {
2268 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2269 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2270
2271 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2272 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2273 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2274 Value *WarpSize =
2275 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2276 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2277 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2278 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2279 Value *WarpSizeCast =
2280 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2281 Value *ShuffleCall =
2282 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2283 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2284}
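// For a 4-byte element the emitted sequence is roughly (a sketch):
// \code
//   %warp_size = ... ; __kmpc_get_warp_size() truncated to i16
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %warp_size)
// \endcode
// Elements wider than 4 (and up to 8) bytes take the @__kmpc_shuffle_int64
// path instead.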
2285
2286void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2287 Value *DstAddr, Type *ElemType,
2288 Value *Offset, Type *ReductionArrayTy) {
2289 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2290 // Create the loop over the big sized data.
2291 // ptr = (void*)Elem;
2292 // ptrEnd = (void*) Elem + 1;
2293 // Step = 8;
2294 // while (ptr + Step < ptrEnd)
2295 // shuffle((int64_t)*ptr);
2296 // Step = 4;
2297 // while (ptr + Step < ptrEnd)
2298 // shuffle((int32_t)*ptr);
2299 // ...
2300 Type *IndexTy = Builder.getIndexTy(
2301 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2302 Value *ElemPtr = DstAddr;
2303 Value *Ptr = SrcAddr;
2304 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2305 if (Size < IntSize)
2306 continue;
2307 Type *IntType = Builder.getIntNTy(IntSize * 8);
2308 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2309 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2310 Value *SrcAddrGEP =
2311 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2312 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2313 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2314
2315 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2316 if ((Size / IntSize) > 1) {
2317 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2318 SrcAddrGEP, Builder.getPtrTy());
2319 BasicBlock *PreCondBB =
2320 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2321 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2322 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2323 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2324 emitBlock(PreCondBB, CurFunc);
2325 PHINode *PhiSrc =
2326 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2327 PhiSrc->addIncoming(Ptr, CurrentBB);
2328 PHINode *PhiDest =
2329 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2330 PhiDest->addIncoming(ElemPtr, CurrentBB);
2331 Ptr = PhiSrc;
2332 ElemPtr = PhiDest;
2333 Value *PtrDiff = Builder.CreatePtrDiff(
2334 Builder.getInt8Ty(), PtrEnd,
2335 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2336 Builder.CreateCondBr(
2337 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2338 ExitBB);
2339 emitBlock(ThenBB, CurFunc);
2340 Value *Res = createRuntimeShuffleFunction(
2341 AllocaIP,
2342 Builder.CreateAlignedLoad(
2343 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2344 IntType, Offset);
2345 Builder.CreateAlignedStore(Res, ElemPtr,
2346 M.getDataLayout().getPrefTypeAlign(ElemType));
2347 Value *LocalPtr =
2348 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2349 Value *LocalElemPtr =
2350 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2351 PhiSrc->addIncoming(LocalPtr, ThenBB);
2352 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2353 emitBranch(PreCondBB);
2354 emitBlock(ExitBB, CurFunc);
2355 } else {
2356 Value *Res = createRuntimeShuffleFunction(
2357 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2358 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2359 Res->getType()->getScalarSizeInBits())
2360 Res = Builder.CreateTrunc(Res, ElemType);
2361 Builder.CreateStore(Res, ElemPtr);
2362 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2363 ElemPtr =
2364 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2365 }
2366 Size = Size % IntSize;
2367 }
2368}
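// As a worked example of the decomposition above: a 12-byte element is moved
// with one 8-byte shuffle (IntSize == 8, a single iteration) followed by one
// 4-byte shuffle for the remaining Size % 8 == 4 bytes.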
2369
2370void OpenMPIRBuilder::emitReductionListCopy(
2371 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2372 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2373 CopyOptionsTy CopyOptions) {
2374 Type *IndexTy = Builder.getIndexTy(
2375 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2376 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2377
2378 // Iterates, element-by-element, through the source Reduce list and
2379 // makes a copy.
2380 for (auto En : enumerate(ReductionInfos)) {
2381 const ReductionInfo &RI = En.value();
2382 Value *SrcElementAddr = nullptr;
2383 Value *DestElementAddr = nullptr;
2384 Value *DestElementPtrAddr = nullptr;
2385 // Should we shuffle in an element from a remote lane?
2386 bool ShuffleInElement = false;
2387 // Set to true to update the pointer in the dest Reduce list to a
2388 // newly created element.
2389 bool UpdateDestListPtr = false;
2390
2391 // Step 1.1: Get the address for the src element in the Reduce list.
2392 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2393 ReductionArrayTy, SrcBase,
2394 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2395 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2396
2397 // Step 1.2: Create a temporary to store the element in the destination
2398 // Reduce list.
2399 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2400 ReductionArrayTy, DestBase,
2401 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2402 switch (Action) {
2403 case CopyAction::RemoteLaneToThread: {
2404 InsertPointTy CurIP = Builder.saveIP();
2405 Builder.restoreIP(AllocaIP);
2406 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2407 ".omp.reduction.element");
2408 DestAlloca->setAlignment(
2409 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2410 DestElementAddr = DestAlloca;
2411 DestElementAddr =
2412 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2413 DestElementAddr->getName() + ".ascast");
2414 Builder.restoreIP(CurIP);
2415 ShuffleInElement = true;
2416 UpdateDestListPtr = true;
2417 break;
2418 }
2419 case CopyAction::ThreadCopy: {
2420 DestElementAddr =
2421 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2422 break;
2423 }
2424 }
2425
2426 // Now that all active lanes have read the element in the
2427 // Reduce list, shuffle over the value from the remote lane.
2428 if (ShuffleInElement) {
2429 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2430 RemoteLaneOffset, ReductionArrayTy);
2431 } else {
2432 switch (RI.EvaluationKind) {
2433 case EvalKind::Scalar: {
2434 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2435 // Store the source element value to the dest element address.
2436 Builder.CreateStore(Elem, DestElementAddr);
2437 break;
2438 }
2439 case EvalKind::Complex: {
2440 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2441 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2442 Value *SrcReal = Builder.CreateLoad(
2443 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2444 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2445 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2446 Value *SrcImg = Builder.CreateLoad(
2447 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2448
2449 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2450 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2451 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2452 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2453 Builder.CreateStore(SrcReal, DestRealPtr);
2454 Builder.CreateStore(SrcImg, DestImgPtr);
2455 break;
2456 }
2457 case EvalKind::Aggregate: {
2458 Value *SizeVal = Builder.getInt64(
2459 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2460 Builder.CreateMemCpy(
2461 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2462 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2463 SizeVal, false);
2464 break;
2465 }
2466 };
2467 }
2468
2469 // Step 3.1: Modify reference in dest Reduce list as needed.
2470 // Modifying the reference in Reduce list to point to the newly
2471 // created element. The element is live in the current function
2472 // scope and that of functions it invokes (i.e., reduce_function).
2473 // RemoteReduceData[i] = (void*)&RemoteElem
2474 if (UpdateDestListPtr) {
2475 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2476 DestElementAddr, Builder.getPtrTy(),
2477 DestElementAddr->getName() + ".ascast");
2478 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2479 }
2480 }
2481}
2482
2483Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2484 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2485 AttributeList FuncAttrs) {
2486 InsertPointTy SavedIP = Builder.saveIP();
2487 LLVMContext &Ctx = M.getContext();
2488 FunctionType *FuncTy = FunctionType::get(
2489 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2490 /* IsVarArg */ false);
2491 Function *WcFunc =
2492 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2493 "_omp_reduction_inter_warp_copy_func", &M);
2494 WcFunc->setAttributes(FuncAttrs);
2495 WcFunc->addParamAttr(0, Attribute::NoUndef);
2496 WcFunc->addParamAttr(1, Attribute::NoUndef);
2497 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2498 Builder.SetInsertPoint(EntryBB);
2499
2500 // ReduceList: thread local Reduce list.
2501 // At the stage of the computation when this function is called, partially
2502 // aggregated values reside in the first lane of every active warp.
2503 Argument *ReduceListArg = WcFunc->getArg(0);
2504 // NumWarps: number of warps active in the parallel region. This could
2505 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2506 Argument *NumWarpsArg = WcFunc->getArg(1);
2507
2508 // This array is used as a medium to transfer, one reduce element at a time,
2509 // the data from the first lane of every warp to lanes in the first warp
2510 // in order to perform the final step of a reduction in a parallel region
2511 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2512 // for reduced latency, as well as to have a distinct copy for concurrently
2513 // executing target regions. The array is declared with common linkage so
2514 // as to be shared across compilation units.
2515 StringRef TransferMediumName =
2516 "__openmp_nvptx_data_transfer_temporary_storage";
2517 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2518 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2519 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2520 if (!TransferMedium) {
2521 TransferMedium = new GlobalVariable(
2522 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2523 UndefValue::get(ArrayTy), TransferMediumName,
2524 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2525 /*AddressSpace=*/3);
2526 }
2527
2528 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2529 Value *GPUThreadID = getGPUThreadID();
2530 // nvptx_lane_id = nvptx_id % warpsize
2531 Value *LaneID = getNVPTXLaneID();
2532 // nvptx_warp_id = nvptx_id / warpsize
2533 Value *WarpID = getNVPTXWarpID();
2534
2535 InsertPointTy AllocaIP =
2536 InsertPointTy(EntryBB, EntryBB->getFirstInsertionPt());
2537
2538 Type *Arg0Type = ReduceListArg->getType();
2539 Type *Arg1Type = NumWarpsArg->getType();
2540 Builder.restoreIP(AllocaIP);
2541 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2542 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2543 AllocaInst *NumWarpsAlloca =
2544 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2545 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2546 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2547 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2548 NumWarpsAlloca, Arg1Type->getPointerTo(),
2549 NumWarpsAlloca->getName() + ".ascast");
2550 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2551 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2552 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2553 InsertPointTy CodeGenIP =
2554 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2555 Builder.restoreIP(CodeGenIP);
2556
2557 Value *ReduceList =
2558 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2559
2560 for (auto En : enumerate(ReductionInfos)) {
2561 //
2562 // Warp master copies reduce element to transfer medium in __shared__
2563 // memory.
2564 //
2565 const ReductionInfo &RI = En.value();
2566 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2567 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2568 Type *CType = Builder.getIntNTy(TySize * 8);
2569
2570 unsigned NumIters = RealTySize / TySize;
2571 if (NumIters == 0)
2572 continue;
2573 Value *Cnt = nullptr;
2574 Value *CntAddr = nullptr;
2575 BasicBlock *PrecondBB = nullptr;
2576 BasicBlock *ExitBB = nullptr;
2577 if (NumIters > 1) {
2578 CodeGenIP = Builder.saveIP();
2579 Builder.restoreIP(AllocaIP);
2580 CntAddr =
2581 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2582
2583 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2584 CntAddr->getName() + ".ascast");
2585 Builder.restoreIP(CodeGenIP);
2586 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2587 CntAddr,
2588 /*Volatile=*/false);
2589 PrecondBB = BasicBlock::Create(Ctx, "precond");
2590 ExitBB = BasicBlock::Create(Ctx, "exit");
2591 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2592 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2593 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2594 /*Volatile=*/false);
2595 Value *Cmp = Builder.CreateICmpULT(
2596 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2597 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2598 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2599 }
2600
2601 // kmpc_barrier.
2602 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2603 omp::Directive::OMPD_unknown,
2604 /* ForceSimpleCall */ false,
2605 /* CheckCancelFlag */ true);
2606 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2607 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2608 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2609
2610 // if (lane_id == 0)
2611 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2612 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2613 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2614
2615 // Reduce element = LocalReduceList[i]
2616 auto *RedListArrayTy =
2617 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2618 Type *IndexTy = Builder.getIndexTy(
2619 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2620 Value *ElemPtrPtr =
2621 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2622 {ConstantInt::get(IndexTy, 0),
2623 ConstantInt::get(IndexTy, En.index())});
2624 // elemptr = ((CopyType*)(elemptrptr)) + I
2625 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2626 if (NumIters > 1)
2627 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2628
2629 // Get pointer to location in transfer medium.
2630 // MediumPtr = &medium[warp_id]
2631 Value *MediumPtr = Builder.CreateInBoundsGEP(
2632 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2633 // elem = *elemptr
2634 //*MediumPtr = elem
2635 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2636 // Store the source element value to the dest element address.
2637 Builder.CreateStore(Elem, MediumPtr,
2638 /*IsVolatile*/ true);
2639 Builder.CreateBr(MergeBB);
2640
2641 // else
2642 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2643 Builder.CreateBr(MergeBB);
2644
2645 // endif
2646 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2647 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2648 omp::Directive::OMPD_unknown,
2649 /* ForceSimpleCall */ false,
2650 /* CheckCancelFlag */ true);
2651
2652 // Warp 0 copies reduce element from transfer medium
2653 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2654 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2655 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2656
2657 Value *NumWarpsVal =
2658 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2659 // Up to 32 threads in warp 0 are active.
2660 Value *IsActiveThread =
2661 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2662 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2663
2664 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2665
2666 // SrcMediumPtr = &medium[tid]
2667 // SrcMediumVal = *SrcMediumPtr
2668 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2669 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2670 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2671 Value *TargetElemPtrPtr =
2672 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2673 {ConstantInt::get(IndexTy, 0),
2674 ConstantInt::get(IndexTy, En.index())});
2675 Value *TargetElemPtrVal =
2676 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2677 Value *TargetElemPtr = TargetElemPtrVal;
2678 if (NumIters > 1)
2679 TargetElemPtr =
2680 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2681
2682 // *TargetElemPtr = SrcMediumVal;
2683 Value *SrcMediumValue =
2684 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2685 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2686 Builder.CreateBr(W0MergeBB);
2687
2688 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2689 Builder.CreateBr(W0MergeBB);
2690
2691 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2692
2693 if (NumIters > 1) {
2694 Cnt = Builder.CreateNSWAdd(
2695 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2696 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2697
2698 auto *CurFn = Builder.GetInsertBlock()->getParent();
2699 emitBranch(PrecondBB);
2700 emitBlock(ExitBB, CurFn);
2701 }
2702 RealTySize %= TySize;
2703 }
2704 }
2705
2707 Builder.restoreIP(SavedIP);
2708
2709 return WcFunc;
2710}
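// The generated helper thus has the shape
//   void _omp_reduction_inter_warp_copy_func(ptr %reduce_list, i32 %num_warps)
// and stages each element through the shared transfer array between two
// barriers: warp masters write their partial value, then the first
// %num_warps threads of warp 0 read it back into their reduce list (a
// summary of the code above, not a separate contract).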
2711
2712Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2713 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2714 AttributeList FuncAttrs) {
2715 LLVMContext &Ctx = M.getContext();
2716 FunctionType *FuncTy =
2717 FunctionType::get(Builder.getVoidTy(),
2718 {Builder.getPtrTy(), Builder.getInt16Ty(),
2719 Builder.getInt16Ty(), Builder.getInt16Ty()},
2720 /* IsVarArg */ false);
2721 Function *SarFunc =
2722 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2723 "_omp_reduction_shuffle_and_reduce_func", &M);
2724 SarFunc->setAttributes(FuncAttrs);
2725 SarFunc->addParamAttr(0, Attribute::NoUndef);
2726 SarFunc->addParamAttr(1, Attribute::NoUndef);
2727 SarFunc->addParamAttr(2, Attribute::NoUndef);
2728 SarFunc->addParamAttr(3, Attribute::NoUndef);
2729 SarFunc->addParamAttr(1, Attribute::SExt);
2730 SarFunc->addParamAttr(2, Attribute::SExt);
2731 SarFunc->addParamAttr(3, Attribute::SExt);
2732 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2733 Builder.SetInsertPoint(EntryBB);
2734
2735 // Thread local Reduce list used to host the values of data to be reduced.
2736 Argument *ReduceListArg = SarFunc->getArg(0);
2737 // Current lane id; could be logical.
2738 Argument *LaneIDArg = SarFunc->getArg(1);
2739 // Offset of the remote source lane relative to the current lane.
2740 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2741 // Algorithm version. This is expected to be known at compile time.
2742 Argument *AlgoVerArg = SarFunc->getArg(3);
2743
2744 Type *ReduceListArgType = ReduceListArg->getType();
2745 Type *LaneIDArgType = LaneIDArg->getType();
2746 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2747 Value *ReduceListAlloca = Builder.CreateAlloca(
2748 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2749 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2750 LaneIDArg->getName() + ".addr");
2751 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2752 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2753 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2754 AlgoVerArg->getName() + ".addr");
2755 ArrayType *RedListArrayTy =
2756 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2757
2758 // Create a local thread-private variable to host the Reduce list
2759 // from a remote lane.
2760 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2761 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2762
2764 ReduceListAlloca, ReduceListArgType,
2765 ReduceListAlloca->getName() + ".ascast");
2767 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2768 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2769 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2770 RemoteLaneOffsetAlloca->getName() + ".ascast");
2772 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2774 RemoteReductionListAlloca, Builder.getPtrTy(),
2775 RemoteReductionListAlloca->getName() + ".ascast");
2776
2777 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2778 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2779 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2780 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2781
2782 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2783 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2784 Value *RemoteLaneOffset =
2785 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2786 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2787
2788 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2789
2790 // This loop iterates through the list of reduce elements and copies,
2791 // element by element, from a remote lane in the warp to RemoteReduceList,
2792 // hosted on the thread's stack.
2793 emitReductionListCopy(
2794 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2795 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2796
2797 // The actions to be performed on the Remote Reduce list depend
2798 // on the algorithm version.
2799 //
2800 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2801 // LaneId % 2 == 0 && Offset > 0):
2802 // do the reduction value aggregation
2803 //
2804 // The thread local variable Reduce list is mutated in place to host the
2805 // reduced data, which is the aggregated value produced from local and
2806 // remote lanes.
2807 //
2808 // Note that AlgoVer is expected to be a constant integer known at compile
2809 // time.
2810 // When AlgoVer==0, the first conjunction evaluates to true, making
2811 // the entire predicate true during compile time.
2812 // When AlgoVer==1, the second conjunction has only the second part to be
2813 // evaluated during runtime. The other conjunctions evaluate to false
2814 // during compile time.
2815 // When AlgoVer==2, the third conjunction has only the second part to be
2816 // evaluated during runtime. The other conjunctions evaluate to false
2817 // during compile time.
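// For instance, with AlgoVer == 1 the predicate below constant-folds to
// (LaneId < RemoteLaneOffset), the usual butterfly-reduction condition, and
// with AlgoVer == 2 it folds to (LaneId & 1) == 0 && RemoteLaneOffset > 0
// (a worked specialization of the description above).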
2818 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2819 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2820 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2821 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2822 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2823 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2824 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2825 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2826 Value *RemoteOffsetComp =
2827 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2828 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2829 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2830 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2831
2832 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2833 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2834 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2835
2836 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2837 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2838 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2839 ReduceList, Builder.getPtrTy());
2840 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2841 RemoteListAddrCast, Builder.getPtrTy());
2842 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2843 ->addFnAttr(Attribute::NoUnwind);
2844 Builder.CreateBr(MergeBB);
2845
2846 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2847 Builder.CreateBr(MergeBB);
2848
2849 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2850
2851 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2852 // Reduce list.
2853 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2854 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2855 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2856
2857 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2858 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2859 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2860 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2861
2862 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2863 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2864 ReductionInfos, RemoteListAddrCast, ReduceList);
2865 Builder.CreateBr(CpyMergeBB);
2866
2867 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2868 Builder.CreateBr(CpyMergeBB);
2869
2870 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2871
2872 Builder.CreateRetVoid();
2873
2874 return SarFunc;
2875}
2876
2877Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2878 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2879 AttributeList FuncAttrs) {
2880 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2881 LLVMContext &Ctx = M.getContext();
2882 auto *FuncTy = FunctionType::get(
2883 Builder.getVoidTy(),
2884 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2885 /* IsVarArg */ false);
2886 Function *LtGCFunc =
2887 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2888 "_omp_reduction_list_to_global_copy_func", &M);
2889 LtGCFunc->setAttributes(FuncAttrs);
2890 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2891 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2892 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2893
2894 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2895 Builder.SetInsertPoint(EntryBlock);
2896
2897 // Buffer: global reduction buffer.
2898 Argument *BufferArg = LtGCFunc->getArg(0);
2899 // Idx: index of the buffer.
2900 Argument *IdxArg = LtGCFunc->getArg(1);
2901 // ReduceList: thread local Reduce list.
2902 Argument *ReduceListArg = LtGCFunc->getArg(2);
2903
2904 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2905 BufferArg->getName() + ".addr");
2906 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2907 IdxArg->getName() + ".addr");
2908 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2909 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2910 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2911 BufferArgAlloca, Builder.getPtrTy(),
2912 BufferArgAlloca->getName() + ".ascast");
2913 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2914 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2915 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2916 ReduceListArgAlloca, Builder.getPtrTy(),
2917 ReduceListArgAlloca->getName() + ".ascast");
2918
2919 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2920 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2921 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2922
2923 Value *LocalReduceList =
2924 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2925 Value *BufferArgVal =
2926 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2927 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2928 Type *IndexTy = Builder.getIndexTy(
2929 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2930 for (auto En : enumerate(ReductionInfos)) {
2931 const ReductionInfo &RI = En.value();
2932 auto *RedListArrayTy =
2933 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2934 // Reduce element = LocalReduceList[i]
2935 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2936 RedListArrayTy, LocalReduceList,
2937 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2938 // elemptr = ((CopyType*)(elemptrptr)) + I
2939 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2940
2941 // Global = Buffer.VD[Idx];
2942 Value *BufferVD =
2943 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2944 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
2945 ReductionsBufferTy, BufferVD, 0, En.index());
2946
2947 switch (RI.EvaluationKind) {
2948 case EvalKind::Scalar: {
2949 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2950 Builder.CreateStore(TargetElement, GlobVal);
2951 break;
2952 }
2953 case EvalKind::Complex: {
2954 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2955 RI.ElementType, ElemPtr, 0, 0, ".realp");
2956 Value *SrcReal = Builder.CreateLoad(
2957 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2958 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2959 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2960 Value *SrcImg = Builder.CreateLoad(
2961 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2962
2963 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2964 RI.ElementType, GlobVal, 0, 0, ".realp");
2965 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2966 RI.ElementType, GlobVal, 0, 1, ".imagp");
2967 Builder.CreateStore(SrcReal, DestRealPtr);
2968 Builder.CreateStore(SrcImg, DestImgPtr);
2969 break;
2970 }
2971 case EvalKind::Aggregate: {
2972 Value *SizeVal =
2973 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2974 Builder.CreateMemCpy(
2975 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2976 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2977 break;
2978 }
2979 }
2980 }
2981
2982 Builder.CreateRetVoid();
2983 Builder.restoreIP(OldIP);
2984 return LtGCFunc;
2985}
2986
2987Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
2988 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2989 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
2990 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2991 LLVMContext &Ctx = M.getContext();
2992 auto *FuncTy = FunctionType::get(
2993 Builder.getVoidTy(),
2994 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2995 /* IsVarArg */ false);
2996 Function *LtGRFunc =
2997 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2998 "_omp_reduction_list_to_global_reduce_func", &M);
2999 LtGRFunc->setAttributes(FuncAttrs);
3000 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3001 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3002 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3003
3004 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3005 Builder.SetInsertPoint(EntryBlock);
3006
3007 // Buffer: global reduction buffer.
3008 Argument *BufferArg = LtGRFunc->getArg(0);
3009 // Idx: index of the buffer.
3010 Argument *IdxArg = LtGRFunc->getArg(1);
3011 // ReduceList: thread local Reduce list.
3012 Argument *ReduceListArg = LtGRFunc->getArg(2);
3013
3014 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3015 BufferArg->getName() + ".addr");
3016 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3017 IdxArg->getName() + ".addr");
3018 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3019 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3020 auto *RedListArrayTy =
3021 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3022
3023 // 1. Build a list of reduction variables.
3024 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3025 Value *LocalReduceList =
3026 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3027
3028 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3029 BufferArgAlloca, Builder.getPtrTy(),
3030 BufferArgAlloca->getName() + ".ascast");
3031 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3032 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3033 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3034 ReduceListArgAlloca, Builder.getPtrTy(),
3035 ReduceListArgAlloca->getName() + ".ascast");
3036 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3037 LocalReduceList, Builder.getPtrTy(),
3038 LocalReduceList->getName() + ".ascast");
3039
3040 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3041 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3042 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3043
3044 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3045 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3046 Type *IndexTy = Builder.getIndexTy(
3047 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3048 for (auto En : enumerate(ReductionInfos)) {
3049 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3050 RedListArrayTy, LocalReduceListAddrCast,
3051 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3052 Value *BufferVD =
3053 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3054 // Global = Buffer.VD[Idx];
3055 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3056 ReductionsBufferTy, BufferVD, 0, En.index());
3057 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3058 }
3059
3060 // Call reduce_function(GlobalReduceList, ReduceList)
3061 Value *ReduceList =
3062 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3063 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3064 ->addFnAttr(Attribute::NoUnwind);
3065 Builder.CreateRetVoid();
3066 Builder.restoreIP(OldIP);
3067 return LtGRFunc;
3068}
3069
3070Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3071 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3072 AttributeList FuncAttrs) {
3073 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3074 LLVMContext &Ctx = M.getContext();
3075 auto *FuncTy = FunctionType::get(
3076 Builder.getVoidTy(),
3077 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3078 /* IsVarArg */ false);
3079 Function *LtGCFunc =
3080 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3081 "_omp_reduction_global_to_list_copy_func", &M);
3082 LtGCFunc->setAttributes(FuncAttrs);
3083 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3084 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3085 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3086
3087 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3088 Builder.SetInsertPoint(EntryBlock);
3089
3090 // Buffer: global reduction buffer.
3091 Argument *BufferArg = LtGCFunc->getArg(0);
3092 // Idx: index of the buffer.
3093 Argument *IdxArg = LtGCFunc->getArg(1);
3094 // ReduceList: thread local Reduce list.
3095 Argument *ReduceListArg = LtGCFunc->getArg(2);
3096
3097 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3098 BufferArg->getName() + ".addr");
3099 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3100 IdxArg->getName() + ".addr");
3101 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3102 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3103 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3104 BufferArgAlloca, Builder.getPtrTy(),
3105 BufferArgAlloca->getName() + ".ascast");
3106 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3107 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3108 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3109 ReduceListArgAlloca, Builder.getPtrTy(),
3110 ReduceListArgAlloca->getName() + ".ascast");
3111 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3112 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3113 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3114
3115 Value *LocalReduceList =
3116 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3117 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3118 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3119 Type *IndexTy = Builder.getIndexTy(
3120 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3121 for (auto En : enumerate(ReductionInfos)) {
3122 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3123 auto *RedListArrayTy =
3124 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3125 // Reduce element = LocalReduceList[i]
3126 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3127 RedListArrayTy, LocalReduceList,
3128 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3129 // elemptr = ((CopyType*)(elemptrptr)) + I
3130 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3131 // Global = Buffer.VD[Idx];
3132 Value *BufferVD =
3133 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3134 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3135 ReductionsBufferTy, BufferVD, 0, En.index());
3136
3137 switch (RI.EvaluationKind) {
3138 case EvalKind::Scalar: {
3139 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3140 Builder.CreateStore(TargetElement, ElemPtr);
3141 break;
3142 }
3143 case EvalKind::Complex: {
3144 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3145 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3146 Value *SrcReal = Builder.CreateLoad(
3147 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3148 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3149 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3150 Value *SrcImg = Builder.CreateLoad(
3151 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3152
3153 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3154 RI.ElementType, ElemPtr, 0, 0, ".realp");
3155 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3156 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3157 Builder.CreateStore(SrcReal, DestRealPtr);
3158 Builder.CreateStore(SrcImg, DestImgPtr);
3159 break;
3160 }
3161 case EvalKind::Aggregate: {
3162 Value *SizeVal =
3163 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3164 Builder.CreateMemCpy(
3165 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3166 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3167 SizeVal, false);
3168 break;
3169 }
3170 }
3171 }
3172
3173 Builder.CreateRetVoid();
3174 Builder.restoreIP(OldIP);
3175 return LtGCFunc;
3176}
3177
3178 Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3179 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3180 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3181 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3182 LLVMContext &Ctx = M.getContext();
3183 auto *FuncTy = FunctionType::get(
3184 Builder.getVoidTy(),
3185 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3186 /* IsVarArg */ false);
3187 Function *LtGRFunc =
3188 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3189 "_omp_reduction_global_to_list_reduce_func", &M);
3190 LtGRFunc->setAttributes(FuncAttrs);
3191 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3192 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3193 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3194
3195 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3196 Builder.SetInsertPoint(EntryBlock);
3197
3198 // Buffer: global reduction buffer.
3199 Argument *BufferArg = LtGRFunc->getArg(0);
3200 // Idx: index of the buffer.
3201 Argument *IdxArg = LtGRFunc->getArg(1);
3202 // ReduceList: thread local Reduce list.
3203 Argument *ReduceListArg = LtGRFunc->getArg(2);
3204
3205 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3206 BufferArg->getName() + ".addr");
3207 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3208 IdxArg->getName() + ".addr");
3209 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3210 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3211 ArrayType *RedListArrayTy =
3212 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3213
3214 // 1. Build a list of reduction variables.
3215 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3216 Value *LocalReduceList =
3217 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3218
3219 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3220 BufferArgAlloca, Builder.getPtrTy(),
3221 BufferArgAlloca->getName() + ".ascast");
3222 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3223 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3224 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3225 ReduceListArgAlloca, Builder.getPtrTy(),
3226 ReduceListArgAlloca->getName() + ".ascast");
3227 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3228 LocalReduceList, Builder.getPtrTy(),
3229 LocalReduceList->getName() + ".ascast");
3230
3231 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3232 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3233 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3234
3235 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3236 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3237 Type *IndexTy = Builder.getIndexTy(
3238 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3239 for (auto En : enumerate(ReductionInfos)) {
3240 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3241 RedListArrayTy, ReductionList,
3242 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3243 // Global = Buffer.VD[Idx];
3244 Value *BufferVD =
3245 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3246 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3247 ReductionsBufferTy, BufferVD, 0, En.index());
3248 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3249 }
3250
3251 // Call reduce_function(ReduceList, GlobalReduceList)
3252 Value *ReduceList =
3253 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3254 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3255 ->addFnAttr(Attribute::NoUnwind);
3256 Builder.CreateRetVoid();
3257 Builder.restoreIP(OldIP);
3258 return LtGRFunc;
3259}
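// Together with the list-to-global pair above, the helpers emitted by this
// function and by emitGlobalToListCopyFunction complete the four callbacks
// that are later handed to __kmpc_nvptx_teams_reduce_nowait_v2: copy and
// reduce in both directions between a thread's private reduce list and the
// fixed team-reduction buffer. All four share the same
// (ptr %buffer, i32 %idx, ptr %reduce_list) signature.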
3260
3261 std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3262 std::string Suffix =
3263 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3264 return (Name + Suffix).str();
3265}
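// Illustrative result (assuming the host configuration's "." separators used
// by createPlatformSpecificName): a reducer named "foo" yields a name like
// "foo.omp.reduction.reduction_func"; GPU configurations substitute their own
// separators.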
3266
3267 Function *OpenMPIRBuilder::createReductionFunction(
3268 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3269 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3270 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3271 {Builder.getPtrTy(), Builder.getPtrTy()},
3272 /* IsVarArg */ false);
3273 std::string Name = getReductionFuncName(ReducerName);
3274 Function *ReductionFunc =
3275 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3276 ReductionFunc->setAttributes(FuncAttrs);
3277 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3278 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3279 BasicBlock *EntryBB =
3280 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3281 Builder.SetInsertPoint(EntryBB);
3282
3283 // Need to alloca memory here and deal with the pointers before getting
3284 // LHS/RHS pointers out
3285 Value *LHSArrayPtr = nullptr;
3286 Value *RHSArrayPtr = nullptr;
3287 Argument *Arg0 = ReductionFunc->getArg(0);
3288 Argument *Arg1 = ReductionFunc->getArg(1);
3289 Type *Arg0Type = Arg0->getType();
3290 Type *Arg1Type = Arg1->getType();
3291
3292 Value *LHSAlloca =
3293 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3294 Value *RHSAlloca =
3295 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3296 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3297 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3298 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3299 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3300 Builder.CreateStore(Arg0, LHSAddrCast);
3301 Builder.CreateStore(Arg1, RHSAddrCast);
3302 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3303 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3304
3305 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3306 Type *IndexTy = Builder.getIndexTy(
3307 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3308 SmallVector<Value *> LHSPtrs, RHSPtrs;
3309 for (auto En : enumerate(ReductionInfos)) {
3310 const ReductionInfo &RI = En.value();
3311 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3312 RedArrayTy, RHSArrayPtr,
3313 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3314 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3315 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3316 RHSI8Ptr, RI.PrivateVariable->getType(),
3317 RHSI8Ptr->getName() + ".ascast");
3318
3319 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3320 RedArrayTy, LHSArrayPtr,
3321 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3322 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3323 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3324 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3325
3326 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3327 LHSPtrs.emplace_back(LHSPtr);
3328 RHSPtrs.emplace_back(RHSPtr);
3329 } else {
3330 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3331 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3332 Value *Reduced;
3333 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3334 if (!Builder.GetInsertBlock())
3335 return ReductionFunc;
3336 Builder.CreateStore(Reduced, LHSPtr);
3337 }
3338 }
3339
3340 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3341 for (auto En : enumerate(ReductionInfos)) {
3342 unsigned Index = En.index();
3343 const ReductionInfo &RI = En.value();
3344 Value *LHSFixupPtr, *RHSFixupPtr;
3345 Builder.restoreIP(RI.ReductionGenClang(
3346 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3347
3348 // Fix the CallBack code generated to use the correct Values for the LHS
3349 // and RHS
3350 LHSFixupPtr->replaceUsesWithIf(
3351 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3352 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3353 ReductionFunc;
3354 });
3355 RHSFixupPtr->replaceUsesWithIf(
3356 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3357 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3358 ReductionFunc;
3359 });
3360 }
3361
3362 Builder.CreateRetVoid();
3363 return ReductionFunc;
3364}
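// Conceptual shape of the function generated above (a sketch; the element
// types and the combine operation come from each ReductionInfo):
//
//   define internal void @<reducer>.omp.reduction.reduction_func(
//       ptr noundef %lhs, ptr noundef %rhs) {
//     ; for every reduction clause item i:
//     ;   %l = load ptr from gep(%lhs, 0, i)   ; pointer to LHS element
//     ;   %r = load ptr from gep(%rhs, 0, i)   ; pointer to RHS element
//     ;   store (ReductionGen(load %l, load %r)) to %l
//     ret void
//   }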
3365
3366 static void
3367 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3368 bool IsGPU) {
3369 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3370 (void)RI;
3371 assert(RI.Variable && "expected non-null variable");
3372 assert(RI.PrivateVariable && "expected non-null private variable");
3373 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3374 "expected non-null reduction generator callback");
3375 if (!IsGPU) {
3376 assert(
3377 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3378 "expected variables and their private equivalents to have the same "
3379 "type");
3380 }
3381 assert(RI.Variable->getType()->isPointerTy() &&
3382 "expected variables to be pointers");
3383 }
3384}
3385
3386 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
3387 const LocationDescription &Loc, InsertPointTy AllocaIP,
3388 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3389 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3390 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3391 unsigned ReductionBufNum, Value *SrcLocInfo) {
3392 if (!updateToLocation(Loc))
3393 return InsertPointTy();
3394 Builder.restoreIP(CodeGenIP);
3395 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3396 LLVMContext &Ctx = M.getContext();
3397
3398 // Source location for the ident struct
3399 if (!SrcLocInfo) {
3400 uint32_t SrcLocStrSize;
3401 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3402 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3403 }
3404
3405 if (ReductionInfos.size() == 0)
3406 return Builder.saveIP();
3407
3408 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3409 AttributeList FuncAttrs;
3410 AttrBuilder AttrBldr(Ctx);
3411 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3412 AttrBldr.addAttribute(Attr);
3413 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3414 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3415
3416 Function *ReductionFunc = nullptr;
3417 CodeGenIP = Builder.saveIP();
3418 ReductionFunc =
3419 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3420 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3421 Builder.restoreIP(CodeGenIP);
3422
3423 // Set the grid value in the config needed for lowering later on
3424 if (GridValue.has_value())
3425 Config.setGridValue(GridValue.value());
3426 else
3427 Config.setGridValue(getGridValue(T, ReductionFunc));
3428
3429 uint32_t SrcLocStrSize;
3430 Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
3431 Value *RTLoc =
3432 getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
3433
3434 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3435 // RedList, shuffle_reduce_func, interwarp_copy_func);
3436 // or
3437 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3438 Value *Res;
3439
3440 // 1. Build a list of reduction variables.
3441 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3442 auto Size = ReductionInfos.size();
3443 Type *PtrTy = PointerType::getUnqual(Ctx);
3444 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3445 CodeGenIP = Builder.saveIP();
3446 Builder.restoreIP(AllocaIP);
3447 Value *ReductionListAlloca =
3448 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3449 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3450 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3451 Builder.restoreIP(CodeGenIP);
3452 Type *IndexTy = Builder.getIndexTy(
3453 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3454 for (auto En : enumerate(ReductionInfos)) {
3455 const ReductionInfo &RI = En.value();
3456 Value *ElemPtr = Builder.CreateInBoundsGEP(
3457 RedArrayTy, ReductionList,
3458 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3459 Value *CastElem =
3460 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3461 Builder.CreateStore(CastElem, ElemPtr);
3462 }
3463 CodeGenIP = Builder.saveIP();
3464 Function *SarFunc =
3465 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3466 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3467 Builder.restoreIP(CodeGenIP);
3468
3469 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3470
3471 unsigned MaxDataSize = 0;
3472 SmallVector<Type *> ReductionTypeArgs;
3473 for (auto En : enumerate(ReductionInfos)) {
3474 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3475 if (Size > MaxDataSize)
3476 MaxDataSize = Size;
3477 ReductionTypeArgs.emplace_back(En.value().ElementType);
3478 }
3479 Value *ReductionDataSize =
3480 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3481 if (!IsTeamsReduction) {
3482 Value *SarFuncCast =
3483 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3484 Value *WcFuncCast =
3485 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3486 Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
3487 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3488 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3489 Res = Builder.CreateCall(Pv2Ptr, Args);
3490 } else {
3491 CodeGenIP = Builder.saveIP();
3492 StructType *ReductionsBufferTy = StructType::create(
3493 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3494 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3495 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3496 Function *LtGCFunc = emitListToGlobalCopyFunction(
3497 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3498 Function *LtGRFunc = emitListToGlobalReduceFunction(
3499 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3500 Function *GtLCFunc = emitGlobalToListCopyFunction(
3501 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3502 Function *GtLRFunc = emitGlobalToListReduceFunction(
3503 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3504 Builder.restoreIP(CodeGenIP);
3505
3506 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3507 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3508
3509 Value *Args3[] = {RTLoc,
3510 KernelTeamsReductionPtr,
3511 Builder.getInt32(ReductionBufNum),
3512 ReductionDataSize,
3513 RL,
3514 SarFunc,
3515 WcFunc,
3516 LtGCFunc,
3517 LtGRFunc,
3518 GtLCFunc,
3519 GtLRFunc};
3520
3521 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3522 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3523 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3524 }
3525
3526 // 5. Build if (res == 1)
3527 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3528 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3529 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3530 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3531
3532 // 6. Build then branch: where we have reduced values in the master
3533 // thread in each team.
3534 // __kmpc_end_reduce{_nowait}(<gtid>);
3535 // break;
3536 emitBlock(ThenBB, CurFunc);
3537
3538 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3539 for (auto En : enumerate(ReductionInfos)) {
3540 const ReductionInfo &RI = En.value();
3541 Value *LHS = RI.Variable;
3542 Value *RHS =
3543 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3544
3545 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3546 Value *LHSPtr, *RHSPtr;
3547 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3548 &LHSPtr, &RHSPtr, CurFunc));
3549
3550 // Fix the CallBack code generated to use the correct Values for the LHS
3551 // and RHS
3552 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3553 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3554 ReductionFunc;
3555 });
3556 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3557 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3558 ReductionFunc;
3559 });
3560 } else {
3561 assert(false && "Unhandled ReductionGenCBKind");
3562 }
3563 }
3564 emitBlock(ExitBB, CurFunc);
3565
3566 Config.setEmitLLVMUsedMetaInfo(true);
3567
3568 return Builder.saveIP();
3569}
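// Net effect (a sketch; argument lists abbreviated): the reduction collapses
// into one of two device runtime calls,
//
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//              ptr %rtloc, i64 %reduce_data_size, ptr %rl,
//              ptr @shuffle_and_reduce, ptr @inter_warp_copy)
//
// or, for teams reductions, @__kmpc_nvptx_teams_reduce_nowait_v2 with the six
// helper functions, followed by an "if (%res == 1)" region that writes the
// combined values back into the original variables.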
3570
3571 static Function *getFreshReductionFunc(Module &M) {
3572 Type *VoidTy = Type::getVoidTy(M.getContext());
3573 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3574 auto *FuncTy =
3575 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3576 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3577 ".omp.reduction.func", &M);
3578}
3579
3580 OpenMPIRBuilder::InsertPointTy
3581 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3582 InsertPointTy AllocaIP,
3583 ArrayRef<ReductionInfo> ReductionInfos,
3584 ArrayRef<bool> IsByRef, bool IsNoWait) {
3585 assert(ReductionInfos.size() == IsByRef.size());
3586 for (const ReductionInfo &RI : ReductionInfos) {
3587 (void)RI;
3588 assert(RI.Variable && "expected non-null variable");
3589 assert(RI.PrivateVariable && "expected non-null private variable");
3590 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3591 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3592 "expected variables and their private equivalents to have the same "
3593 "type");
3594 assert(RI.Variable->getType()->isPointerTy() &&
3595 "expected variables to be pointers");
3596 }
3597
3598 if (!updateToLocation(Loc))
3599 return InsertPointTy();
3600
3601 BasicBlock *InsertBlock = Loc.IP.getBlock();
3602 BasicBlock *ContinuationBlock =
3603 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3604 InsertBlock->getTerminator()->eraseFromParent();
3605
3606 // Create and populate array of type-erased pointers to private reduction
3607 // values.
3608 unsigned NumReductions = ReductionInfos.size();
3609 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3610 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3611 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3612
3613 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3614
3615 for (auto En : enumerate(ReductionInfos)) {
3616 unsigned Index = En.index();
3617 const ReductionInfo &RI = En.value();
3618 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3619 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3620 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3621 }
3622
3623 // Emit a call to the runtime function that orchestrates the reduction.
3624 // Declare the reduction function in the process.
3625 Function *Func = Builder.GetInsertBlock()->getParent();
3626 Module *Module = Func->getParent();
3627 uint32_t SrcLocStrSize;
3628 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3629 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3630 return RI.AtomicReductionGen;
3631 });
3632 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3633 CanGenerateAtomic
3634 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3635 : IdentFlag(0));
3636 Value *ThreadId = getOrCreateThreadID(Ident);
3637 Constant *NumVariables = Builder.getInt32(NumReductions);
3638 const DataLayout &DL = Module->getDataLayout();
3639 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3640 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3641 Function *ReductionFunc = getFreshReductionFunc(*Module);
3642 Value *Lock = getOMPCriticalRegionLock(".reduction");
3643 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3644 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3645 : RuntimeFunction::OMPRTL___kmpc_reduce);
3646 CallInst *ReduceCall =
3647 Builder.CreateCall(ReduceFunc,
3648 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3649 ReductionFunc, Lock},
3650 "reduce");
3651
3652 // Create final reduction entry blocks for the atomic and non-atomic case.
3653 // Emit IR that dispatches control flow to one of the blocks based on
3654 // whether the reduction supports the atomic mode.
3655 BasicBlock *NonAtomicRedBlock =
3656 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3657 BasicBlock *AtomicRedBlock =
3658 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3659 SwitchInst *Switch =
3660 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3661 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3662 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3663
3664 // Populate the non-atomic reduction using the elementwise reduction function.
3665 // This loads the elements from the global and private variables and reduces
3666 // them before storing back the result to the global variable.
3667 Builder.SetInsertPoint(NonAtomicRedBlock);
3668 for (auto En : enumerate(ReductionInfos)) {
3669 const ReductionInfo &RI = En.value();
3670 Type *ValueType = RI.ElementType;
3671 // We have one less load for the by-ref case because that load is now
3672 // inside the reduction region.
3673 Value *RedValue = nullptr;
3674 if (!IsByRef[En.index()]) {
3675 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3676 "red.value." + Twine(En.index()));
3677 }
3678 Value *PrivateRedValue =
3679 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3680 "red.private.value." + Twine(En.index()));
3681 Value *Reduced;
3682 if (IsByRef[En.index()]) {
3683 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
3684 PrivateRedValue, Reduced));
3685 } else {
3686 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
3687 PrivateRedValue, Reduced));
3688 }
3689 if (!Builder.GetInsertBlock())
3690 return InsertPointTy();
3691 // for by-ref case, the load is inside of the reduction region
3692 if (!IsByRef[En.index()])
3693 Builder.CreateStore(Reduced, RI.Variable);
3694 }
3695 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3696 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3697 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3698 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3699 Builder.CreateBr(ContinuationBlock);
3700
3701 // Populate the atomic reduction using the atomic elementwise reduction
3702 // function. There are no loads/stores here because they will be happening
3703 // inside the atomic elementwise reduction.
3704 Builder.SetInsertPoint(AtomicRedBlock);
3705 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3706 for (const ReductionInfo &RI : ReductionInfos) {
3707 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
3708 RI.Variable, RI.PrivateVariable));
3709 if (!Builder.GetInsertBlock())
3710 return InsertPointTy();
3711 }
3712 Builder.CreateBr(ContinuationBlock);
3713 } else {
3714 Builder.CreateUnreachable();
3715 }
3716
3717 // Populate the outlined reduction function using the elementwise reduction
3718 // function. Partial values are extracted from the type-erased array of
3719 // pointers to private variables.
3720 BasicBlock *ReductionFuncBlock =
3721 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3722 Builder.SetInsertPoint(ReductionFuncBlock);
3723 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3724 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3725
3726 for (auto En : enumerate(ReductionInfos)) {
3727 const ReductionInfo &RI = En.value();
3728 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3729 RedArrayTy, LHSArrayPtr, 0, En.index());
3730 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3731 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3732 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3733 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3734 RedArrayTy, RHSArrayPtr, 0, En.index());
3735 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3736 Value *RHSPtr =
3737 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3738 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3739 Value *Reduced;
3740 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
3741 if (!Builder.GetInsertBlock())
3742 return InsertPointTy();
3743 // store is inside of the reduction region when using by-ref
3744 if (!IsByRef[En.index()])
3745 Builder.CreateStore(Reduced, LHSPtr);
3746 }
3747 Builder.CreateRetVoid();
3748
3749 Builder.SetInsertPoint(ContinuationBlock);
3750 return Builder.saveIP();
3751}
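// The emitted control flow follows the classic host-side kmpc pattern
// (a sketch):
//
//   %r = call i32 @__kmpc_reduce[_nowait](%ident, %tid, <n>, <size>,
//                                         ptr %red.array,
//                                         ptr @.omp.reduction.func, %lock)
//   switch i32 %r:
//     1 -> non-atomic combine, then __kmpc_end_reduce[_nowait]
//     2 -> atomic combine (taken only if every clause has an AtomicReductionGen)
//     default -> reduce.finalize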
3752
3753 OpenMPIRBuilder::InsertPointTy
3754 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3755 BodyGenCallbackTy BodyGenCB,
3756 FinalizeCallbackTy FiniCB) {
3757
3758 if (!updateToLocation(Loc))
3759 return Loc.IP;
3760
3761 Directive OMPD = Directive::OMPD_master;
3762 uint32_t SrcLocStrSize;
3763 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3764 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3765 Value *ThreadId = getOrCreateThreadID(Ident);
3766 Value *Args[] = {Ident, ThreadId};
3767
3768 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3769 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3770
3771 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3772 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3773
3774 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3775 /*Conditional*/ true, /*hasFinalize*/ true);
3776}
3777
3778 OpenMPIRBuilder::InsertPointTy
3779 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3780 BodyGenCallbackTy BodyGenCB,
3781 FinalizeCallbackTy FiniCB, Value *Filter) {
3782 if (!updateToLocation(Loc))
3783 return Loc.IP;
3784
3785 Directive OMPD = Directive::OMPD_masked;
3786 uint32_t SrcLocStrSize;
3787 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3788 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3789 Value *ThreadId = getOrCreateThreadID(Ident);
3790 Value *Args[] = {Ident, ThreadId, Filter};
3791 Value *ArgsEnd[] = {Ident, ThreadId};
3792
3793 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3794 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3795
3796 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3797 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3798
3799 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3800 /*Conditional*/ true, /*hasFinalize*/ true);
3801}
3802
3803 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3804 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3805 BasicBlock *PostInsertBefore, const Twine &Name) {
3806 Module *M = F->getParent();
3807 LLVMContext &Ctx = M->getContext();
3808 Type *IndVarTy = TripCount->getType();
3809
3810 // Create the basic block structure.
3811 BasicBlock *Preheader =
3812 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3813 BasicBlock *Header =
3814 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3815 BasicBlock *Cond =
3816 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3817 BasicBlock *Body =
3818 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3819 BasicBlock *Latch =
3820 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3821 BasicBlock *Exit =
3822 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3823 BasicBlock *After =
3824 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3825
3826 // Use specified DebugLoc for new instructions.
3827 Builder.SetCurrentDebugLocation(DL);
3828
3829 Builder.SetInsertPoint(Preheader);
3830 Builder.CreateBr(Header);
3831
3832 Builder.SetInsertPoint(Header);
3833 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3834 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3835 Builder.CreateBr(Cond);
3836
3837 Builder.SetInsertPoint(Cond);
3838 Value *Cmp =
3839 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3840 Builder.CreateCondBr(Cmp, Body, Exit);
3841
3842 Builder.SetInsertPoint(Body);
3843 Builder.CreateBr(Latch);
3844
3845 Builder.SetInsertPoint(Latch);
3846 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3847 "omp_" + Name + ".next", /*HasNUW=*/true);
3848 Builder.CreateBr(Header);
3849 IndVarPHI->addIncoming(Next, Latch);
3850
3851 Builder.SetInsertPoint(Exit);
3852 Builder.CreateBr(After);
3853
3854 // Remember and return the canonical control flow.
3855 LoopInfos.emplace_front();
3856 CanonicalLoopInfo *CL = &LoopInfos.front();
3857
3858 CL->Header = Header;
3859 CL->Cond = Cond;
3860 CL->Latch = Latch;
3861 CL->Exit = Exit;
3862
3863#ifndef NDEBUG
3864 CL->assertOK();
3865#endif
3866 return CL;
3867}
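// Resulting control flow of the skeleton, with %omp_<name>.iv as the only PHI
// in the header:
//
//   preheader -> header -> cond -> body -> inc -> header   (backedge)
//                            \--> exit -> after            (iv u>= tripcount)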
3868
3869 CanonicalLoopInfo *
3870 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3871 LoopBodyGenCallbackTy BodyGenCB,
3872 Value *TripCount, const Twine &Name) {
3873 BasicBlock *BB = Loc.IP.getBlock();
3874 BasicBlock *NextBB = BB->getNextNode();
3875
3876 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3877 NextBB, NextBB, Name);
3878 BasicBlock *After = CL->getAfter();
3879
3880 // If location is not set, don't connect the loop.
3881 if (updateToLocation(Loc)) {
3882 // Split the loop at the insertion point: Branch to the preheader and move
3883 // every following instruction to after the loop (the After BB). Also, the
3884 // new successor is the loop's after block.
3885 spliceBB(Builder, After, /*CreateBranch=*/false);
3886 Builder.CreateBr(CL->getPreheader());
3887 }
3888
3889 // Emit the body content. We do it after connecting the loop to the CFG to
3890 // avoid having the callback encounter degenerate BBs.
3891 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
3892
3893#ifndef NDEBUG
3894 CL->assertOK();
3895#endif
3896 return CL;
3897}
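// Typical use of this entry point (an illustrative sketch; OMPBuilder, Loc and
// TripCnt are assumed to exist in the caller):
//
//   CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
//       Loc,
//       [&](OpenMPIRBuilder::InsertPointTy CodeGenIP, Value *IV) {
//         // Emit the body here; IV iterates over [0, TripCnt) with step 1.
//       },
//       TripCnt);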
3898
3899 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
3900 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3901 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3902 InsertPointTy ComputeIP, const Twine &Name) {
3903
3904 // Consider the following difficulties (assuming 8-bit signed integers):
3905 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3906 // DO I = 1, 100, 50
3907 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3908 // DO I = 100, 0, -128
3909
3910 // Start, Stop and Step must be of the same integer type.
3911 auto *IndVarTy = cast<IntegerType>(Start->getType());
3912 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3913 assert(IndVarTy == Step->getType() && "Step type mismatch");
3914
3915 LocationDescription ComputeLoc =
3916 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3917 updateToLocation(ComputeLoc);
3918
3919 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3920 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3921
3922 // Like Step, but always positive.
3923 Value *Incr = Step;
3924
3925 // Distance between Start and Stop; always positive.
3926 Value *Span;
3927
3928 // Condition under which no iterations are executed at all, e.g. because
3929 // UB < LB.
3930 Value *ZeroCmp;
3931
3932 if (IsSigned) {
3933 // Ensure that increment is positive. If not, negate and invert LB and UB.
3934 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3935 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3936 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3937 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3938 Span = Builder.CreateSub(UB, LB, "", false, true);
3939 ZeroCmp = Builder.CreateICmp(
3940 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3941 } else {
3942 Span = Builder.CreateSub(Stop, Start, "", true);
3943 ZeroCmp = Builder.CreateICmp(
3944 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3945 }
3946
3947 Value *CountIfLooping;
3948 if (InclusiveStop) {
3949 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3950 } else {
3951 // Avoid incrementing past stop since it could overflow.
3952 Value *CountIfTwo = Builder.CreateAdd(
3953 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3954 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3955 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3956 }
3957 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3958 "omp_" + Name + ".tripcount");
3959
3960 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3961 Builder.restoreIP(CodeGenIP);
3962 Value *Span = Builder.CreateMul(IV, Step);
3963 Value *IndVar = Builder.CreateAdd(Span, Start);
3964 BodyGenCB(Builder.saveIP(), IndVar);
3965 };
3966 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3967 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3968}
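// Worked example (unsigned, exclusive stop): Start=0, Stop=100, Step=50 gives
// Span = 100 and CountIfTwo = (100 - 1) / 50 + 1 = 2, so TripCount = 2; the
// canonical IV takes the values {0, 1} while the user-visible induction
// variable IndVar = Start + IV * Step takes {0, 50}.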
3969
3970// Returns an LLVM function to call for initializing loop bounds using OpenMP
3971// static scheduling depending on `type`. Only i32 and i64 are supported by the
3972// runtime. Always interpret integers as unsigned similarly to
3973 // CanonicalLoopInfo.
3974 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
3975 OpenMPIRBuilder &OMPBuilder) {
3976 unsigned Bitwidth = Ty->getIntegerBitWidth();
3977 if (Bitwidth == 32)
3978 return OMPBuilder.getOrCreateRuntimeFunction(
3979 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3980 if (Bitwidth == 64)
3981 return OMPBuilder.getOrCreateRuntimeFunction(
3982 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3983 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3984}
3985
3986 OpenMPIRBuilder::InsertPointTy
3987 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
3988 InsertPointTy AllocaIP,
3989 bool NeedsBarrier) {
3990 assert(CLI->isValid() && "Requires a valid canonical loop");
3991 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3992 "Require dedicated allocate IP");
3993
3994 // Set up the source location value for OpenMP runtime.
3995 Builder.restoreIP(CLI->getPreheaderIP());
3996 Builder.SetCurrentDebugLocation(DL);
3997
3998 uint32_t SrcLocStrSize;
3999 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4000 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4001
4002 // Declare useful OpenMP runtime functions.
4003 Value *IV = CLI->getIndVar();
4004 Type *IVTy = IV->getType();
4005 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4006 FunctionCallee StaticFini =
4007 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4008
4009 // Allocate space for computed loop bounds as expected by the "init" function.
4010 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4011
4012 Type *I32Type = Type::getInt32Ty(M.getContext());
4013 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4014 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4015 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4016 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4017
4018 // At the end of the preheader, prepare for calling the "init" function by
4019 // storing the current loop bounds into the allocated space. A canonical loop
4020 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4021 // and produces an inclusive upper bound.
4022 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4023 Constant *Zero = ConstantInt::get(IVTy, 0);
4024 Constant *One = ConstantInt::get(IVTy, 1);
4025 Builder.CreateStore(Zero, PLowerBound);
4026 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4027 Builder.CreateStore(UpperBound, PUpperBound);
4028 Builder.CreateStore(One, PStride);
4029
4030 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4031
4032 Constant *SchedulingType = ConstantInt::get(
4033 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4034
4035 // Call the "init" function and update the trip count of the loop with the
4036 // value it produced.
4037 Builder.CreateCall(StaticInit,
4038 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4039 PUpperBound, PStride, One, Zero});
4040 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4041 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4042 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4043 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4044 CLI->setTripCount(TripCount);
4045
4046 // Update all uses of the induction variable except the one in the condition
4047 // block that compares it with the actual upper bound, and the increment in
4048 // the latch block.
4049
4050 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4051 Builder.SetInsertPoint(CLI->getBody(),
4052 CLI->getBody()->getFirstInsertionPt());
4053 Builder.SetCurrentDebugLocation(DL);
4054 return Builder.CreateAdd(OldIV, LowerBound);
4055 });
4056
4057 // In the "exit" block, call the "fini" function.
4058 Builder.SetInsertPoint(CLI->getExit(),
4059 CLI->getExit()->getTerminator()->getIterator());
4060 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4061
4062 // Add the barrier if requested.
4063 if (NeedsBarrier)
4064 createBarrier(LocationDescription(Builder.saveIP(), DL),
4065 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4066 /* CheckCancelFlag */ false);
4067
4068 InsertPointTy AfterIP = CLI->getAfterIP();
4069 CLI->invalidate();
4070
4071 return AfterIP;
4072}
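// Runtime-visible effect per thread (a sketch, for a 32-bit IV):
//
//   call @__kmpc_for_static_init_4u(%loc, %tid, /*schedtype=*/UnorderedStatic,
//                                   %p.lastiter, %p.lowerbound, %p.upperbound,
//                                   %p.stride, /*incr=*/1, /*chunk=*/0)
//   ; the loop then runs from the returned lower bound to the returned
//   ; inclusive upper bound
//   call @__kmpc_for_static_fini(%loc, %tid)
//   ; plus an OMPD_for barrier when NeedsBarrier is set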
4073
4074 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4075 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4076 bool NeedsBarrier, Value *ChunkSize) {
4077 assert(CLI->isValid() && "Requires a valid canonical loop");
4078 assert(ChunkSize && "Chunk size is required");
4079
4080 LLVMContext &Ctx = CLI->getFunction()->getContext();
4081 Value *IV = CLI->getIndVar();
4082 Value *OrigTripCount = CLI->getTripCount();
4083 Type *IVTy = IV->getType();
4084 assert(IVTy->getIntegerBitWidth() <= 64 &&
4085 "Max supported tripcount bitwidth is 64 bits");
4086 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4087 : Type::getInt64Ty(Ctx);
4088 Type *I32Type = Type::getInt32Ty(M.getContext());
4089 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4090 Constant *One = ConstantInt::get(InternalIVTy, 1);
4091
4092 // Declare useful OpenMP runtime functions.
4093 FunctionCallee StaticInit =
4094 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4095 FunctionCallee StaticFini =
4096 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4097
4098 // Allocate space for computed loop bounds as expected by the "init" function.
4099 Builder.restoreIP(AllocaIP);
4100 Builder.SetCurrentDebugLocation(DL);
4101 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4102 Value *PLowerBound =
4103 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4104 Value *PUpperBound =
4105 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4106 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4107
4108 // Set up the source location value for the OpenMP runtime.
4109 Builder.restoreIP(CLI->getPreheaderIP());
4110 Builder.SetCurrentDebugLocation(DL);
4111
4112 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4113 Value *CastedChunkSize =
4114 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4115 Value *CastedTripCount =
4116 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4117
4118 Constant *SchedulingType = ConstantInt::get(
4119 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4120 Builder.CreateStore(Zero, PLowerBound);
4121 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4122 Builder.CreateStore(OrigUpperBound, PUpperBound);
4123 Builder.CreateStore(One, PStride);
4124
4125 // Call the "init" function and update the trip count of the loop with the
4126 // value it produced.
4127 uint32_t SrcLocStrSize;
4128 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4129 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4130 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4131 Builder.CreateCall(StaticInit,
4132 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4133 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4134 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4135 /*pstride=*/PStride, /*incr=*/One,
4136 /*chunk=*/CastedChunkSize});
4137
4138 // Load values written by the "init" function.
4139 Value *FirstChunkStart =
4140 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4141 Value *FirstChunkStop =
4142 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4143 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4144 Value *ChunkRange =
4145 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4146 Value *NextChunkStride =
4147 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4148
4149 // Create outer "dispatch" loop for enumerating the chunks.
4150 BasicBlock *DispatchEnter = splitBB(Builder, true);
4151 Value *DispatchCounter;
4152 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
4153 {Builder.saveIP(), DL},
4154 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
4155 FirstChunkStart, CastedTripCount, NextChunkStride,
4156 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4157 "dispatch");
4158
4159 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4160 // not have to preserve the canonical invariant.
4161 BasicBlock *DispatchBody = DispatchCLI->getBody();
4162 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4163 BasicBlock *DispatchExit = DispatchCLI->getExit();
4164 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4165 DispatchCLI->invalidate();
4166
4167 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4168 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4169 redirectTo(CLI->getExit(), DispatchLatch, DL);
4170 redirectTo(DispatchBody, DispatchEnter, DL);
4171
4172 // Prepare the prolog of the chunk loop.
4173 Builder.restoreIP(CLI->getPreheaderIP());
4174 Builder.SetCurrentDebugLocation(DL);
4175
4176 // Compute the number of iterations of the chunk loop.
4177 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4178 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4179 Value *IsLastChunk =
4180 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4181 Value *CountUntilOrigTripCount =
4182 Builder.CreateSub(CastedTripCount, DispatchCounter);
4183 Value *ChunkTripCount = Builder.CreateSelect(
4184 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4185 Value *BackcastedChunkTC =
4186 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4187 CLI->setTripCount(BackcastedChunkTC);
4188
4189 // Update all uses of the induction variable except the one in the condition
4190 // block that compares it with the actual upper bound, and the increment in
4191 // the latch block.
4192 Value *BackcastedDispatchCounter =
4193 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4194 CLI->mapIndVar([&](Instruction *) -> Value * {
4195 Builder.restoreIP(CLI->getBodyIP());
4196 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4197 });
4198
4199 // In the "exit" block, call the "fini" function.
4200 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4201 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4202
4203 // Add the barrier if requested.
4204 if (NeedsBarrier)
4205 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4206 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4207
4208#ifndef NDEBUG
4209 // Even though we currently do not support applying additional methods to it,
4210 // the chunk loop should remain a canonical loop.
4211 CLI->assertOK();
4212#endif
4213
4214 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
4215}
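// Resulting shape (a sketch): an outer "dispatch" canonical loop enumerates
// chunk start indices from omp_firstchunk.lb up to the trip count in steps of
// omp_dispatch.stride, and the original loop, rewired inside it, executes
// omp_chunk.tripcount iterations per chunk; __kmpc_for_static_fini is called
// once in the dispatch exit block.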
4216
4217// Returns an LLVM function to call for executing an OpenMP static worksharing
4218// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4219// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4220 static FunctionCallee
4221 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4222 WorksharingLoopType LoopType) {
4223 unsigned Bitwidth = Ty->getIntegerBitWidth();
4224 Module &M = OMPBuilder->M;
4225 switch (LoopType) {
4226 case WorksharingLoopType::ForStaticLoop:
4227 if (Bitwidth == 32)
4228 return OMPBuilder->getOrCreateRuntimeFunction(
4229 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4230 if (Bitwidth == 64)
4231 return OMPBuilder->getOrCreateRuntimeFunction(
4232 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4233 break;
4234 case WorksharingLoopType::DistributeStaticLoop:
4235 if (Bitwidth == 32)
4236 return OMPBuilder->getOrCreateRuntimeFunction(
4237 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4238 if (Bitwidth == 64)
4239 return OMPBuilder->getOrCreateRuntimeFunction(
4240 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4241 break;
4242 case WorksharingLoopType::DistributeForStaticLoop:
4243 if (Bitwidth == 32)
4244 return OMPBuilder->getOrCreateRuntimeFunction(
4245 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4246 if (Bitwidth == 64)
4247 return OMPBuilder->getOrCreateRuntimeFunction(
4248 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4249 break;
4250 }
4251 if (Bitwidth != 32 && Bitwidth != 64) {
4252 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4253 }
4254 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4255}
4256
4257// Inserts a call to proper OpenMP Device RTL function which handles
4258// loop worksharing.
4260 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4261 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4262 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4263 Type *TripCountTy = TripCount->getType();
4264 Module &M = OMPBuilder->M;
4265 IRBuilder<> &Builder = OMPBuilder->Builder;
4266 FunctionCallee RTLFn =
4267 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4268 SmallVector<Value *, 8> RealArgs;
4269 RealArgs.push_back(Ident);
4270 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4271 RealArgs.push_back(LoopBodyArg);
4272 RealArgs.push_back(TripCount);
4273 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4274 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4275 Builder.CreateCall(RTLFn, RealArgs);
4276 return;
4277 }
4278 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4279 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4280 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4281 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4282
4283 RealArgs.push_back(
4284 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4285 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4286 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4287 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4288 }
4289
4290 Builder.CreateCall(RTLFn, RealArgs);
4291}
4292
4293 static void
4294 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4295 CanonicalLoopInfo *CLI, Value *Ident,
4296 Function &OutlinedFn, Type *ParallelTaskPtr,
4297 const SmallVector<Instruction *, 4> &ToBeDeleted,
4298 WorksharingLoopType LoopType) {
4299 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4300 BasicBlock *Preheader = CLI->getPreheader();
4301 Value *TripCount = CLI->getTripCount();
4302
4303 // After loop body outlining, the loop body contains only the setup of the
4304 // loop body argument structure and the call to the outlined loop body
4305 // function. First, we need to move the setup of the loop body arguments
4306 // into the loop preheader.
4307 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4308 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4309
4310 // The next step is to remove the whole loop. We do not need it anymore.
4311 // That's why we make an unconditional branch from the loop preheader to the
4312 // loop exit block.
4313 Builder.restoreIP({Preheader, Preheader->end()});
4314 Preheader->getTerminator()->eraseFromParent();
4315 Builder.CreateBr(CLI->getExit());
4316
4317 // Delete dead loop blocks
4318 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4319 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4320 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4321 CleanUpInfo.EntryBB = CLI->getHeader();
4322 CleanUpInfo.ExitBB = CLI->getExit();
4323 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4324 DeleteDeadBlocks(BlocksToBeRemoved);
4325
4326 // Find the instruction which corresponds to loop body argument structure
4327 // and remove the call to loop body function instruction.
4328 Value *LoopBodyArg;
4329 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4330 assert(OutlinedFnUser &&
4331 "Expected unique undroppable user of outlined function");
4332 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4333 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4334 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4335 "Expected outlined function call to be located in loop preheader");
4336 // Check in case no argument structure has been passed.
4337 if (OutlinedFnCallInstruction->arg_size() > 1)
4338 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4339 else
4340 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4341 OutlinedFnCallInstruction->eraseFromParent();
4342
4343 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4344 LoopBodyArg, ParallelTaskPtr, TripCount,
4345 OutlinedFn);
4346
4347 for (auto &ToBeDeletedItem : ToBeDeleted)
4348 ToBeDeletedItem->eraseFromParent();
4349 CLI->invalidate();
4350}
4351
4352 OpenMPIRBuilder::InsertPointTy
4353 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4354 InsertPointTy AllocaIP,
4355 WorksharingLoopType LoopType) {
4356 uint32_t SrcLocStrSize;
4357 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4358 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4359
4360 OutlineInfo OI;
4361 OI.OuterAllocaBB = CLI->getPreheader();
4362 Function *OuterFn = CLI->getPreheader()->getParent();
4363
4364 // Instructions which need to be deleted at the end of code generation
4365 SmallVector<Instruction *, 4> ToBeDeleted;
4366
4367 OI.OuterAllocaBB = AllocaIP.getBlock();
4368
4369 // Mark the loop body as a region which needs to be extracted
4370 OI.EntryBB = CLI->getBody();
4371 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4372 "omp.prelatch", true);
4373
4374 // Prepare loop body for extraction
4375 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4376
4377 // Insert a new loop counter variable which will be used only in the loop
4378 // body.
4379 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4380 Instruction *NewLoopCntLoad =
4381 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4382 // New loop counter instructions are redundant in the loop preheader when
4383 // code generation for the workshare loop is finished. That's why we mark
4384 // them as ready for deletion.
4385 ToBeDeleted.push_back(NewLoopCntLoad);
4386 ToBeDeleted.push_back(NewLoopCnt);
4387
4388 // Analyse loop body region. Find all input variables which are used inside
4389 // loop body region.
4390 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4391 SmallVector<BasicBlock *, 32> Blocks;
4392 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4393 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4394 ParallelRegionBlockSet.end());
4395
4396 CodeExtractorAnalysisCache CEAC(*OuterFn);
4397 CodeExtractor Extractor(Blocks,
4398 /* DominatorTree */ nullptr,
4399 /* AggregateArgs */ true,
4400 /* BlockFrequencyInfo */ nullptr,
4401 /* BranchProbabilityInfo */ nullptr,
4402 /* AssumptionCache */ nullptr,
4403 /* AllowVarArgs */ true,
4404 /* AllowAlloca */ true,
4405 /* AllocationBlock */ CLI->getPreheader(),
4406 /* Suffix */ ".omp_wsloop",
4407 /* AggrArgsIn0AddrSpace */ true);
4408
4409 BasicBlock *CommonExit = nullptr;
4410 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4411
4412 // Find allocas outside the loop body region which are used inside loop
4413 // body
4414 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4415
4416 // We need to model loop body region as the function f(cnt, loop_arg).
4417 // That's why we replace loop induction variable by the new counter
4418 // which will be one of the loop body function arguments.
4419 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4420 CLI->getIndVar()->user_end());
4421 for (auto Use : Users) {
4422 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4423 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4424 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4425 }
4426 }
4427 }
4428 // Make sure that the loop counter variable is not merged into the loop body
4429 // function argument structure and that it is passed as a separate variable.
4430 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4431
4432 // PostOutline CB is invoked when the loop body function is outlined and
4433 // the loop body is replaced by a call to the outlined function. We need to
4434 // add a call to the OpenMP device RTL inside the loop preheader. The OpenMP
4435 // device RTL function will handle the loop control logic.
4436 //
4437 OI.PostOutlineCB = [=, ToBeDeletedVec =
4438 std::move(ToBeDeleted)](Function &OutlinedFn) {
4439 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4440 ToBeDeletedVec, LoopType);
4441 };
4442 addOutlineInfo(std::move(OI));
4443 return CLI->getAfterIP();
4444}
4445
4446 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
4447 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4448 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4449 bool HasSimdModifier, bool HasMonotonicModifier,
4450 bool HasNonmonotonicModifier, bool HasOrderedClause,
4451 WorksharingLoopType LoopType) {
4452 if (Config.isTargetDevice())
4453 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4454 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4455 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4456 HasNonmonotonicModifier, HasOrderedClause);
4457
4458 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4459 OMPScheduleType::ModifierOrdered;
4460 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4461 case OMPScheduleType::BaseStatic:
4462 assert(!ChunkSize && "No chunk size with static (non-chunked) schedule");
4463 if (IsOrdered)
4464 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4465 NeedsBarrier, ChunkSize);
4466 // FIXME: Monotonicity ignored?
4467 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4468
4469 case OMPScheduleType::BaseStaticChunked:
4470 if (IsOrdered)
4471 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4472 NeedsBarrier, ChunkSize);
4473 // FIXME: Monotonicity ignored?
4474 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4475 ChunkSize);
4476
4477 case OMPScheduleType::BaseRuntime:
4478 case OMPScheduleType::BaseAuto:
4479 case OMPScheduleType::BaseGreedy:
4480 case OMPScheduleType::BaseBalanced:
4481 case OMPScheduleType::BaseSteal:
4482 case OMPScheduleType::BaseGuidedSimd:
4483 case OMPScheduleType::BaseRuntimeSimd:
4484 assert(!ChunkSize &&
4485 "schedule type does not support user-defined chunk sizes");
4486 [[fallthrough]];
4487 case OMPScheduleType::BaseDynamicChunked:
4488 case OMPScheduleType::BaseGuidedChunked:
4489 case OMPScheduleType::BaseGuidedIterativeChunked:
4490 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4491 case OMPScheduleType::BaseStaticBalancedChunked:
4492 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4493 NeedsBarrier, ChunkSize);
4494
4495 default:
4496 llvm_unreachable("Unknown/unimplemented schedule kind");
4497 }
4498}
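// In summary: plain static schedules use the static init/fini lowering above,
// static-chunked schedules use the chunked variant, and dynamic, guided, auto,
// runtime, and all ordered schedules go through the dispatch-based
// applyDynamicWorkshareLoop below; on target devices every schedule is
// funneled through applyWorkshareLoopTarget instead.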
4499
4500/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4501/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4502/// the runtime. Always interpret integers as unsigned similarly to
4503/// CanonicalLoopInfo.
4504 static FunctionCallee
4505 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4506 unsigned Bitwidth = Ty->getIntegerBitWidth();
4507 if (Bitwidth == 32)
4508 return OMPBuilder.getOrCreateRuntimeFunction(
4509 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4510 if (Bitwidth == 64)
4511 return OMPBuilder.getOrCreateRuntimeFunction(
4512 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4513 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4514}
4515
4516/// Returns an LLVM function to call for updating the next loop using OpenMP
4517/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4518/// the runtime. Always interpret integers as unsigned similarly to
4519/// CanonicalLoopInfo.
4520 static FunctionCallee
4521 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4522 unsigned Bitwidth = Ty->getIntegerBitWidth();
4523 if (Bitwidth == 32)
4524 return OMPBuilder.getOrCreateRuntimeFunction(
4525 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4526 if (Bitwidth == 64)
4527 return OMPBuilder.getOrCreateRuntimeFunction(
4528 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4529 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4530}
4531
4532/// Returns an LLVM function to call for finalizing the dynamic loop using
4533/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4534/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4535static FunctionCallee
4537 unsigned Bitwidth = Ty->getIntegerBitWidth();
4538 if (Bitwidth == 32)
4539 return OMPBuilder.getOrCreateRuntimeFunction(
4540 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4541 if (Bitwidth == 64)
4542 return OMPBuilder.getOrCreateRuntimeFunction(
4543 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4544 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4545}
4546
4547OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
4548 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4549 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
4550 assert(CLI->isValid() && "Requires a valid canonical loop");
4551 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4552 "Require dedicated allocate IP");
4553 assert(isValidWorkshareLoopScheduleType(SchedType) &&
4554 "Require valid schedule type");
4555
4556 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4557 OMPScheduleType::ModifierOrdered;
4558
4559 // Set up the source location value for OpenMP runtime.
4560 Builder.SetCurrentDebugLocation(DL);
4561
4562 uint32_t SrcLocStrSize;
4563 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4564 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4565
4566 // Declare useful OpenMP runtime functions.
4567 Value *IV = CLI->getIndVar();
4568 Type *IVTy = IV->getType();
4569 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4570 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4571
4572 // Allocate space for computed loop bounds as expected by the "init" function.
4573 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4574 Type *I32Type = Type::getInt32Ty(M.getContext());
4575 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4576 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4577 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4578 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4579
4580 // At the end of the preheader, prepare for calling the "init" function by
4581 // storing the current loop bounds into the allocated space. A canonical loop
4582 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4583 // and produces an inclusive upper bound.
4584 BasicBlock *PreHeader = CLI->getPreheader();
4585 Builder.SetInsertPoint(PreHeader->getTerminator());
4586 Constant *One = ConstantInt::get(IVTy, 1);
4587 Builder.CreateStore(One, PLowerBound);
4588 Value *UpperBound = CLI->getTripCount();
4589 Builder.CreateStore(UpperBound, PUpperBound);
4590 Builder.CreateStore(One, PStride);
4591
4592 BasicBlock *Header = CLI->getHeader();
4593 BasicBlock *Exit = CLI->getExit();
4594 BasicBlock *Cond = CLI->getCond();
4595 BasicBlock *Latch = CLI->getLatch();
4596 InsertPointTy AfterIP = CLI->getAfterIP();
4597
4598 // The CLI will be "broken" in the code below, as the loop is no longer
4599 // a valid canonical loop.
4600
4601 if (!Chunk)
4602 Chunk = One;
4603
4604 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4605
4606 Constant *SchedulingType =
4607 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4608
4609 // Call the "init" function.
4610 Builder.CreateCall(DynamicInit,
4611 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4612 UpperBound, /* step */ One, Chunk});
4613
4614 // An outer loop around the existing one.
4615 BasicBlock *OuterCond = BasicBlock::Create(
4616 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4617 PreHeader->getParent());
4618 // This needs to be 32-bit always, so can't use the IVTy Zero above.
4619 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4620 Value *Res =
4621 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4622 PLowerBound, PUpperBound, PStride});
4623 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4624 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4625 Value *LowerBound =
4626 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4627 Builder.CreateCondBr(MoreWork, Header, Exit);
4628
4629 // Change PHI-node in loop header to use outer cond rather than preheader,
4630 // and set IV to the LowerBound.
4631 Instruction *Phi = &Header->front();
4632 auto *PI = cast<PHINode>(Phi);
4633 PI->setIncomingBlock(0, OuterCond);
4634 PI->setIncomingValue(0, LowerBound);
4635
4636 // Then set the pre-header to jump to the OuterCond
4637 Instruction *Term = PreHeader->getTerminator();
4638 auto *Br = cast<BranchInst>(Term);
4639 Br->setSuccessor(0, OuterCond);
4640
4641 // Modify the inner condition:
4642 // * Use the UpperBound returned from the DynamicNext call.
4643 // * Jump to the outer loop when done with one of the inner loops.
4644 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4645 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4646 Instruction *Comp = &*Builder.GetInsertPoint();
4647 auto *CI = cast<CmpInst>(Comp);
4648 CI->setOperand(1, UpperBound);
4649 // Redirect the inner exit to branch to outer condition.
4650 Instruction *Branch = &Cond->back();
4651 auto *BI = cast<BranchInst>(Branch);
4652 assert(BI->getSuccessor(1) == Exit);
4653 BI->setSuccessor(1, OuterCond);
4654
4655 // Call the "fini" function if "ordered" is present in the wsloop directive.
4656 if (Ordered) {
4657 Builder.SetInsertPoint(&Latch->back());
4658 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4659 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4660 }
4661
4662 // Add the barrier if requested.
4663 if (NeedsBarrier) {
4664 Builder.SetInsertPoint(&Exit->back());
4665 createBarrier(LocationDescription(Builder.saveIP(), DL),
4666 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4667 /* CheckCancelFlag */ false);
4668 }
4669
4670 CLI->invalidate();
4671 return AfterIP;
4672}
4673
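// Editorial sketch: the IR built by applyDynamicWorkshareLoop above follows
// the usual libomp dispatch protocol. In pseudo C (names illustrative, 64-bit
// induction variable assumed):
//
//   __kmpc_dispatch_init_8u(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount,
//                           /*st=*/1, chunk);
//   while (__kmpc_dispatch_next_8u(loc, tid, &last, &lb, &ub, &st)) {
//     for (uint64_t iv = lb - 1; iv < ub; ++iv) // zero-based body indices
//       body(iv);
//   }
//
// With an ordered clause, __kmpc_dispatch_fini_8u(loc, tid) additionally runs
// in the latch, i.e. once per iteration, as emitted above.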
4674/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4675/// after this \p OldTarget will be orphaned.
4676 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4677 BasicBlock *NewTarget, DebugLoc DL) {
4678 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4679 redirectTo(Pred, NewTarget, DL);
4680}
4681
4682/// Determine which blocks in \p BBs are reachable from outside and remove the
4683/// ones that are not reachable from the function.
4684 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4685 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4686 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4687 for (Use &U : BB->uses()) {
4688 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4689 if (!UseInst)
4690 continue;
4691 if (BBsToErase.count(UseInst->getParent()))
4692 continue;
4693 return true;
4694 }
4695 return false;
4696 };
4697
4698 while (BBsToErase.remove_if(HasRemainingUses)) {
4699 // Try again if anything was removed.
4700 }
4701
4702 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4703 DeleteDeadBlocks(BBVec);
4704}
4705
4706 CanonicalLoopInfo *
4707 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4708 InsertPointTy ComputeIP) {
4709 assert(Loops.size() >= 1 && "At least one loop required");
4710 size_t NumLoops = Loops.size();
4711
4712 // Nothing to do if there is already just one loop.
4713 if (NumLoops == 1)
4714 return Loops.front();
4715
4716 CanonicalLoopInfo *Outermost = Loops.front();
4717 CanonicalLoopInfo *Innermost = Loops.back();
4718 BasicBlock *OrigPreheader = Outermost->getPreheader();
4719 BasicBlock *OrigAfter = Outermost->getAfter();
4720 Function *F = OrigPreheader->getParent();
4721
4722 // Loop control blocks that may become orphaned later.
4723 SmallVector<BasicBlock *, 12> OldControlBBs;
4724 OldControlBBs.reserve(6 * Loops.size());
4725 for (CanonicalLoopInfo *Loop : Loops)
4726 Loop->collectControlBlocks(OldControlBBs);
4727
4728 // Setup the IRBuilder for inserting the trip count computation.
4729 Builder.SetCurrentDebugLocation(DL);
4730 if (ComputeIP.isSet())
4731 Builder.restoreIP(ComputeIP);
4732 else
4733 Builder.restoreIP(Outermost->getPreheaderIP());
4734
4735 // Derive the collapsed loop's trip count.
4736 // TODO: Find common/largest indvar type.
4737 Value *CollapsedTripCount = nullptr;
4738 for (CanonicalLoopInfo *L : Loops) {
4739 assert(L->isValid() &&
4740 "All loops to collapse must be valid canonical loops");
4741 Value *OrigTripCount = L->getTripCount();
4742 if (!CollapsedTripCount) {
4743 CollapsedTripCount = OrigTripCount;
4744 continue;
4745 }
4746
4747 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4748 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4749 {}, /*HasNUW=*/true);
4750 }
4751
4752 // Create the collapsed loop control flow.
4753 CanonicalLoopInfo *Result =
4754 createLoopSkeleton(DL, CollapsedTripCount, F,
4755 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4756
4757 // Build the collapsed loop body code.
4758 // Start with deriving the input loop induction variables from the collapsed
4759 // one, using a divmod scheme. To preserve the original loops' order, the
4760 // innermost loop uses the least significant bits.
4761 Builder.restoreIP(Result->getBodyIP());
4762
4763 Value *Leftover = Result->getIndVar();
4764 SmallVector<Value *> NewIndVars;
4765 NewIndVars.resize(NumLoops);
4766 for (int i = NumLoops - 1; i >= 1; --i) {
4767 Value *OrigTripCount = Loops[i]->getTripCount();
4768
4769 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4770 NewIndVars[i] = NewIndVar;
4771
4772 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4773 }
4774 // Outermost loop gets all the remaining bits.
4775 NewIndVars[0] = Leftover;
4776
4777 // Construct the loop body control flow.
4778 // We progressively construct the branch structure following the direction of
4779 // the control flow, from the leading in-between code, the loop nest body, the
4780 // trailing in-between code, and rejoining the collapsed loop's latch.
4781 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
4782 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4783 // its predecessors as sources.
4784 BasicBlock *ContinueBlock = Result->getBody();
4785 BasicBlock *ContinuePred = nullptr;
4786 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4787 BasicBlock *NextSrc) {
4788 if (ContinueBlock)
4789 redirectTo(ContinueBlock, Dest, DL);
4790 else
4791 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4792
4793 ContinueBlock = nullptr;
4794 ContinuePred = NextSrc;
4795 };
4796
4797 // The code before the nested loop of each level.
4798 // Because we are sinking it into the nest, it will be executed more often
4799 // than the original loop. More sophisticated schemes could keep track of what
4800 // the in-between code is and instantiate it only once per thread.
4801 for (size_t i = 0; i < NumLoops - 1; ++i)
4802 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4803
4804 // Connect the loop nest body.
4805 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4806
4807 // The code after the nested loop at each level.
4808 for (size_t i = NumLoops - 1; i > 0; --i)
4809 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4810
4811 // Connect the finished loop to the collapsed loop latch.
4812 ContinueWith(Result->getLatch(), nullptr);
4813
4814 // Replace the input loops with the new collapsed loop.
4815 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4816 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4817
4818 // Replace the input loop indvars with the derived ones.
4819 for (size_t i = 0; i < NumLoops; ++i)
4820 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4821
4822 // Remove unused parts of the input loops.
4823 removeUnusedBlocksFromParent(OldControlBBs);
4824
4825 for (CanonicalLoopInfo *L : Loops)
4826 L->invalidate();
4827
4828#ifndef NDEBUG
4829 Result->assertOK();
4830#endif
4831 return Result;
4832}
4833
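// Editorial sketch: the divmod scheme used by collapseLoops, shown on plain
// integers for a two-loop nest with trip counts TC0 (outer) and TC1 (inner).
// Collapsed iteration CV maps back to the original induction variables as:
//
//   uint64_t IV1 = CV % TC1;  // innermost gets the least significant part
//   uint64_t IV0 = CV / TC1;  // outermost gets the remaining bits
//
// For TC0 = 3, TC1 = 4, CV = 9 this recovers IV0 = 2, IV1 = 1, i.e. the same
// position the iteration had in the original row-major order.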
4834std::vector<CanonicalLoopInfo *>
4835 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4836 ArrayRef<Value *> TileSizes) {
4837 assert(TileSizes.size() == Loops.size() &&
4838 "Must pass as many tile sizes as there are loops");
4839 int NumLoops = Loops.size();
4840 assert(NumLoops >= 1 && "At least one loop to tile required");
4841
4842 CanonicalLoopInfo *OutermostLoop = Loops.front();
4843 CanonicalLoopInfo *InnermostLoop = Loops.back();
4844 Function *F = OutermostLoop->getBody()->getParent();
4845 BasicBlock *InnerEnter = InnermostLoop->getBody();
4846 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4847
4848 // Loop control blocks that may become orphaned later.
4849 SmallVector<BasicBlock *, 12> OldControlBBs;
4850 OldControlBBs.reserve(6 * Loops.size());
4851 for (CanonicalLoopInfo *Loop : Loops)
4852 Loop->collectControlBlocks(OldControlBBs);
4853
4854 // Collect original trip counts and induction variable to be accessible by
4855 // index. Also, the structure of the original loops is not preserved during
4856 // the construction of the tiled loops, so do it before we scavenge the BBs of
4857 // any original CanonicalLoopInfo.
4858 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4859 for (CanonicalLoopInfo *L : Loops) {
4860 assert(L->isValid() && "All input loops must be valid canonical loops");
4861 OrigTripCounts.push_back(L->getTripCount());
4862 OrigIndVars.push_back(L->getIndVar());
4863 }
4864
4865 // Collect the code between loop headers. These may contain SSA definitions
4866 // that are used in the loop nest body. To be usable within the innermost
4867 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4868 // these instructions may be executed more often than before the tiling.
4869 // TODO: It would be sufficient to only sink them into body of the
4870 // corresponding tile loop.
4871 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
4872 for (int i = 0; i < NumLoops - 1; ++i) {
4873 CanonicalLoopInfo *Surrounding = Loops[i];
4874 CanonicalLoopInfo *Nested = Loops[i + 1];
4875
4876 BasicBlock *EnterBB = Surrounding->getBody();
4877 BasicBlock *ExitBB = Nested->getHeader();
4878 InbetweenCode.emplace_back(EnterBB, ExitBB);
4879 }
4880
4881 // Compute the trip counts of the floor loops.
4882 Builder.SetCurrentDebugLocation(DL);
4883 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4884 SmallVector<Value *, 4> FloorCount, FloorRems;
4885 for (int i = 0; i < NumLoops; ++i) {
4886 Value *TileSize = TileSizes[i];
4887 Value *OrigTripCount = OrigTripCounts[i];
4888 Type *IVType = OrigTripCount->getType();
4889
4890 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
4891 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
4892
4893 // 0 if tripcount divides the tilesize, 1 otherwise.
4894 // 1 means we need an additional iteration for a partial tile.
4895 //
4896 // Unfortunately we cannot just use the roundup-formula
4897 // (tripcount + tilesize - 1)/tilesize
4898 // because the summation might overflow. We do not want to introduce undefined
4899 // behavior when the untiled loop nest did not.
4900 Value *FloorTripOverflow =
4901 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
4902
4903 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
4904 FloorTripCount =
4905 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
4906 "omp_floor" + Twine(i) + ".tripcount", true);
4907
4908 // Remember some values for later use.
4909 FloorCount.push_back(FloorTripCount);
4910 FloorRems.push_back(FloorTripRem);
4911 }
4912
4913 // Generate the new loop nest, from the outermost to the innermost.
4914 std::vector<CanonicalLoopInfo *> Result;
4915 Result.reserve(NumLoops * 2);
4916
4917 // The basic block of the surrounding loop that enters the generated loop
4918 // nest.
4919 BasicBlock *Enter = OutermostLoop->getPreheader();
4920
4921 // The basic block of the surrounding loop where the inner code should
4922 // continue.
4923 BasicBlock *Continue = OutermostLoop->getAfter();
4924
4925 // Where the next loop basic block should be inserted.
4926 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
4927
4928 auto EmbeddNewLoop =
4929 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
4930 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
4931 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
4932 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
4933 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
4934 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
4935
4936 // Setup the position where the next embedded loop connects to this loop.
4937 Enter = EmbeddedLoop->getBody();
4938 Continue = EmbeddedLoop->getLatch();
4939 OutroInsertBefore = EmbeddedLoop->getLatch();
4940 return EmbeddedLoop;
4941 };
4942
4943 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
4944 const Twine &NameBase) {
4945 for (auto P : enumerate(TripCounts)) {
4946 CanonicalLoopInfo *EmbeddedLoop =
4947 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
4948 Result.push_back(EmbeddedLoop);
4949 }
4950 };
4951
4952 EmbeddNewLoops(FloorCount, "floor");
4953
4954 // Within the innermost floor loop, emit the code that computes the tile
4955 // sizes.
4956 Builder.restoreIP(Result.back()->getBodyIP());
4957 SmallVector<Value *, 4> TileCounts;
4958 for (int i = 0; i < NumLoops; ++i) {
4959 CanonicalLoopInfo *FloorLoop = Result[i];
4960 Value *TileSize = TileSizes[i];
4961
4962 Value *FloorIsEpilogue =
4963 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
4964 Value *TileTripCount =
4965 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
4966
4967 TileCounts.push_back(TileTripCount);
4968 }
4969
4970 // Create the tile loops.
4971 EmbeddNewLoops(TileCounts, "tile");
4972
4973 // Insert the inbetween code into the body.
4974 BasicBlock *BodyEnter = Enter;
4975 BasicBlock *BodyEntered = nullptr;
4976 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
4977 BasicBlock *EnterBB = P.first;
4978 BasicBlock *ExitBB = P.second;
4979
4980 if (BodyEnter)
4981 redirectTo(BodyEnter, EnterBB, DL);
4982 else
4983 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
4984
4985 BodyEnter = nullptr;
4986 BodyEntered = ExitBB;
4987 }
4988
4989 // Append the original loop nest body into the generated loop nest body.
4990 if (BodyEnter)
4991 redirectTo(BodyEnter, InnerEnter, DL);
4992 else
4993 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
4994 redirectAllPredecessorsTo(InnerLatch, Continue, DL);
4995
4996 // Replace the original induction variable with an induction variable computed
4997 // from the tile and floor induction variables.
4998 Builder.restoreIP(Result.back()->getBodyIP());
4999 for (int i = 0; i < NumLoops; ++i) {
5000 CanonicalLoopInfo *FloorLoop = Result[i];
5001 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5002 Value *OrigIndVar = OrigIndVars[i];
5003 Value *Size = TileSizes[i];
5004
5005 Value *Scale =
5006 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5007 Value *Shift =
5008 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5009 OrigIndVar->replaceAllUsesWith(Shift);
5010 }
5011
5012 // Remove unused parts of the original loops.
5013 removeUnusedBlocksFromParent(OldControlBBs);
5014
5015 for (CanonicalLoopInfo *L : Loops)
5016 L->invalidate();
5017
5018#ifndef NDEBUG
5019 for (CanonicalLoopInfo *GenL : Result)
5020 GenL->assertOK();
5021#endif
5022 return Result;
5023}
5024
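// Editorial sketch: the index reconstruction performed by tileLoops. With
// tile size TS, floor induction variable FV, and tile induction variable TV,
// the original induction variable is recomputed (with NUW arithmetic, as
// above) as:
//
//   uint64_t OrigIV = TS * FV + TV;
//
// e.g. TS = 4, FV = 2, TV = 3 gives OrigIV = 11: element 3 of tile 2. The
// epilogue tile's trip count is the remainder computed into FloorRems above.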
5025/// Attach metadata \p Properties to the basic block described by \p BB. If the
5026/// basic block already has metadata, the basic block properties are appended.
5027 static void addBasicBlockMetadata(BasicBlock *BB,
5028 ArrayRef<Metadata *> Properties) {
5029 // Nothing to do if no property to attach.
5030 if (Properties.empty())
5031 return;
5032
5033 LLVMContext &Ctx = BB->getContext();
5034 SmallVector<Metadata *> NewProperties;
5035 NewProperties.push_back(nullptr);
5036
5037 // If the basic block already has metadata, prepend it to the new metadata.
5038 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5039 if (Existing)
5040 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5041
5042 append_range(NewProperties, Properties);
5043 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5044 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5045
5046 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5047}
5048
5049/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5050/// loop already has metadata, the loop properties are appended.
5051 static void addLoopMetadata(CanonicalLoopInfo *Loop,
5052 ArrayRef<Metadata *> Properties) {
5053 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5054
5055 // Attach metadata to the loop's latch
5056 BasicBlock *Latch = Loop->getLatch();
5057 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5058 addBasicBlockMetadata(Latch, Properties);
5059}
5060
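// For reference (editorial note): the distinct node created above is the
// usual self-referential loop-ID form, for example
//
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
//
// with new properties appended after any operands the latch already carried.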
5061/// Attach llvm.access.group metadata to the memref instructions of \p Block
5062static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5063 LoopInfo &LI) {
5064 for (Instruction &I : *Block) {
5065 if (I.mayReadOrWriteMemory()) {
5066 // TODO: This instruction may already have access group from
5067 // other pragmas e.g. #pragma clang loop vectorize. Append
5068 // so that the existing metadata is not overwritten.
5069 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5070 }
5071 }
5072}
5073
5074 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5075 LLVMContext &Ctx = Builder.getContext();
5076 addLoopMetadata(
5077 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5078 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5079}
5080
5081 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5082 LLVMContext &Ctx = Builder.getContext();
5083 addLoopMetadata(
5084 Loop, {
5085 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5086 });
5087}
5088
5089void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5090 Value *IfCond, ValueToValueMapTy &VMap,
5091 const Twine &NamePrefix) {
5092 Function *F = CanonicalLoop->getFunction();
5093
5094 // Define where if branch should be inserted
5095 Instruction *SplitBefore;
5096 if (Instruction::classof(IfCond)) {
5097 SplitBefore = dyn_cast<Instruction>(IfCond);
5098 } else {
5099 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5100 }
5101
5102 // TODO: We should not rely on pass manager. Currently we use pass manager
5103 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5104 // object. We should have a method which returns all blocks between
5105 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5106 FunctionAnalysisManager FAM;
5107 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5108 FAM.registerPass([]() { return LoopAnalysis(); });
5109 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5110
5111 // Get the loop which needs to be cloned
5112 LoopAnalysis LIA;
5113 LoopInfo &&LI = LIA.run(*F, FAM);
5114 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5115
5116 // Create additional blocks for the if statement
5117 BasicBlock *Head = SplitBefore->getParent();
5118 Instruction *HeadOldTerm = Head->getTerminator();
5119 llvm::LLVMContext &C = Head->getContext();
5120 BasicBlock *ThenBlock = BasicBlock::Create(
5121 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5122 BasicBlock *ElseBlock = BasicBlock::Create(
5123 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5124
5125 // Create if condition branch.
5126 Builder.SetInsertPoint(HeadOldTerm);
5127 Instruction *BrInstr =
5128 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5129 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5130 // Then block contains branch to omp loop which needs to be vectorized
5131 spliceBB(IP, ThenBlock, false);
5132 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5133
5134 Builder.SetInsertPoint(ElseBlock);
5135
5136 // Clone loop for the else branch
5137 SmallVector<BasicBlock *, 8> NewBlocks;
5138
5139 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5140 for (BasicBlock *Block : L->getBlocks()) {
5141 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5142 NewBB->moveBefore(CanonicalLoop->getExit());
5143 VMap[Block] = NewBB;
5144 NewBlocks.push_back(NewBB);
5145 }
5146 remapInstructionsInBlocks(NewBlocks, VMap);
5147 Builder.CreateBr(NewBlocks.front());
5148}
5149
5150unsigned
5151 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5152 const StringMap<bool> &Features) {
5153 if (TargetTriple.isX86()) {
5154 if (Features.lookup("avx512f"))
5155 return 512;
5156 else if (Features.lookup("avx"))
5157 return 256;
5158 return 128;
5159 }
5160 if (TargetTriple.isPPC())
5161 return 128;
5162 if (TargetTriple.isWasm())
5163 return 128;
5164 return 0;
5165}
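// Usage sketch (editorial addition): querying the default simd alignment,
// assuming a function `F` whose module carries the target triple; the
// feature map would normally be parsed from the "target-features" attribute.
//
//   Triple TT(F->getParent()->getTargetTriple());
//   StringMap<bool> Features;
//   Features["avx"] = true;
//   unsigned Alignment = OMPBuilder.getOpenMPDefaultSimdAlign(TT, Features);
//   // x86 with AVX yields 256; unknown targets yield 0.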
5166
5167 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5168 MapVector<Value *, Value *> AlignedVars,
5169 Value *IfCond, OrderKind Order,
5170 ConstantInt *Simdlen, ConstantInt *Safelen) {
5171 LLVMContext &Ctx = Builder.getContext();
5172
5173 Function *F = CanonicalLoop->getFunction();
5174
5175 // TODO: We should not rely on pass manager. Currently we use pass manager
5176 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5177 // object. We should have a method which returns all blocks between
5178 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5179 FunctionAnalysisManager FAM;
5180 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5181 FAM.registerPass([]() { return LoopAnalysis(); });
5182 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5183
5184 LoopAnalysis LIA;
5185 LoopInfo &&LI = LIA.run(*F, FAM);
5186
5187 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5188 if (AlignedVars.size()) {
5189 InsertPointTy IP = Builder.saveIP();
5190 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
5191 for (auto &AlignedItem : AlignedVars) {
5192 Value *AlignedPtr = AlignedItem.first;
5193 Value *Alignment = AlignedItem.second;
5194 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5195 AlignedPtr, Alignment);
5196 }
5197 Builder.restoreIP(IP);
5198 }
5199
5200 if (IfCond) {
5201 ValueToValueMapTy VMap;
5202 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5203 // Add metadata to the cloned loop which disables vectorization
5204 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5205 assert(MappedLatch &&
5206 "Cannot find value which corresponds to original loop latch");
5207 assert(isa<BasicBlock>(MappedLatch) &&
5208 "Cannot cast mapped latch block value to BasicBlock");
5209 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5210 ConstantAsMetadata *BoolConst =
5211 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5212 addBasicBlockMetadata(
5213 NewLatchBlock,
5214 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5215 BoolConst})});
5216 }
5217
5218 SmallSet<BasicBlock *, 8> Reachable;
5219
5220 // Get the basic blocks from the loop in which memref instructions
5221 // can be found.
5222 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5223 // preferably without running any passes.
5224 for (BasicBlock *Block : L->getBlocks()) {
5225 if (Block == CanonicalLoop->getCond() ||
5226 Block == CanonicalLoop->getHeader())
5227 continue;
5228 Reachable.insert(Block);
5229 }
5230
5231 SmallVector<Metadata *> LoopMDList;
5232
5233 // In presence of finite 'safelen', it may be unsafe to mark all
5234 // the memory instructions parallel, because loop-carried
5235 // dependences of 'safelen' iterations are possible.
5236 // If clause order(concurrent) is specified then the memory instructions
5237 // are marked parallel even if 'safelen' is finite.
5238 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5239 // Add access group metadata to memory-access instructions.
5240 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5241 for (BasicBlock *BB : Reachable)
5242 addSimdMetadata(BB, AccessGroup, LI);
5243 // TODO: If the loop has existing parallel access metadata, have
5244 // to combine two lists.
5245 LoopMDList.push_back(MDNode::get(
5246 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5247 }
5248
5249 // Use the above access group metadata to create loop level
5250 // metadata, which should be distinct for each loop.
5251 ConstantAsMetadata *BoolConst =
5252 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5253 LoopMDList.push_back(MDNode::get(
5254 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5255
5256 if (Simdlen || Safelen) {
5257 // If both simdlen and safelen clauses are specified, the value of the
5258 // simdlen parameter must be less than or equal to the value of the safelen
5259 // parameter. Therefore, use safelen only in the absence of simdlen.
5260 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5261 LoopMDList.push_back(
5262 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5263 ConstantAsMetadata::get(VectorizeWidth)}));
5264 }
5265
5266 addLoopMetadata(CanonicalLoop, LoopMDList);
5267}
5268
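// Editorial sketch: for `#pragma omp simd simdlen(8)` with no finite safelen,
// the metadata attached by applySimd amounts to:
//
//   !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}   ; the access group put on each memory instruction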
5269/// Create the TargetMachine object to query the backend for optimization
5270/// preferences.
5271///
5272/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5273/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5274 /// needed for the LLVM pass pipeline. We use some default options to avoid
5275/// having to pass too many settings from the frontend that probably do not
5276/// matter.
5277///
5278/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5279/// method. If we are going to use TargetMachine for more purposes, especially
5280/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5281 /// might become worth requiring front-ends to pass on their TargetMachine,
5282 /// or at least cache it between methods. Note that while frontends such as Clang
5283/// have just a single main TargetMachine per translation unit, "target-cpu" and
5284/// "target-features" that determine the TargetMachine are per-function and can
5285 /// be overridden using __attribute__((target("OPTIONS"))).
5286static std::unique_ptr<TargetMachine>
5287 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5288 Module *M = F->getParent();
5289
5290 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5291 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5292 const std::string &Triple = M->getTargetTriple();
5293
5294 std::string Error;
5295 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5296 if (!TheTarget)
5297 return {};
5298
5299 llvm::TargetOptions Options;
5300 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5301 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5302 /*CodeModel=*/std::nullopt, OptLevel));
5303}
5304
5305 /// Heuristically determine the best-performing unroll factor for \p CLI. This
5306/// depends on the target processor. We are re-using the same heuristics as the
5307/// LoopUnrollPass.
5308 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5309 Function *F = CLI->getFunction();
5310
5311 // Assume the user requests the most aggressive unrolling, even if the rest of
5312 // the code is optimized using a lower setting.
5313 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5314 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5315
5316 FunctionAnalysisManager FAM;
5317 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5318 FAM.registerPass([]() { return AssumptionAnalysis(); });
5319 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5320 FAM.registerPass([]() { return LoopAnalysis(); });
5321 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5322 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5323 TargetIRAnalysis TIRA;
5324 if (TM)
5325 TIRA = TargetIRAnalysis(
5326 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5327 FAM.registerPass([&]() { return TIRA; });
5328
5329 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5330 ScalarEvolutionAnalysis SEA;
5331 ScalarEvolution &&SE = SEA.run(*F, FAM);
5332 DominatorTreeAnalysis DTA;
5333 DominatorTree &&DT = DTA.run(*F, FAM);
5334 LoopAnalysis LIA;
5335 LoopInfo &&LI = LIA.run(*F, FAM);
5336 AssumptionAnalysis ACT;
5337 AssumptionCache &&AC = ACT.run(*F, FAM);
5338 OptimizationRemarkEmitter ORE{F};
5339
5340 Loop *L = LI.getLoopFor(CLI->getHeader());
5341 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5342
5343 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5344 L, SE, TTI,
5345 /*BlockFrequencyInfo=*/nullptr,
5346 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5347 /*UserThreshold=*/std::nullopt,
5348 /*UserCount=*/std::nullopt,
5349 /*UserAllowPartial=*/true,
5350 /*UserAllowRuntime=*/true,
5351 /*UserUpperBound=*/std::nullopt,
5352 /*UserFullUnrollMaxCount=*/std::nullopt);
5353
5354 UP.Force = true;
5355
5356 // Account for additional optimizations taking place before the LoopUnrollPass
5357 // would unroll the loop.
5358 UP.Threshold *= UnrollThresholdFactor;
5359 UP.PartialThreshold *= UnrollThresholdFactor;
5360
5361 // Use normal unroll factors even if the rest of the code is optimized for
5362 // size.
5363 UP.OptSizeThreshold = UP.Threshold;
5364 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5365
5366 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5367 << " Threshold=" << UP.Threshold << "\n"
5368 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5369 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5370 << " PartialOptSizeThreshold="
5371 << UP.PartialOptSizeThreshold << "\n");
5372
5373 // Disable peeling.
5374 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5375 L, SE, TTI,
5376 /*UserAllowPeeling=*/false,
5377 /*UserAllowProfileBasedPeeling=*/false,
5378 /*UnrollingSpecficValues=*/false);
5379
5379
5380 SmallPtrSet<const Value *, 32> EphValues;
5381 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5382
5383 // Assume that reads and writes to stack variables can be eliminated by
5384 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5385 // size.
5386 for (BasicBlock *BB : L->blocks()) {
5387 for (Instruction &I : *BB) {
5388 Value *Ptr;
5389 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5390 Ptr = Load->getPointerOperand();
5391 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5392 Ptr = Store->getPointerOperand();
5393 } else
5394 continue;
5395
5396 Ptr = Ptr->stripPointerCasts();
5397
5398 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5399 if (Alloca->getParent() == &F->getEntryBlock())
5400 EphValues.insert(&I);
5401 }
5402 }
5403 }
5404
5405 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5406
5407 // Loop is not unrollable if the loop contains certain instructions.
5408 if (!UCE.canUnroll()) {
5409 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5410 return 1;
5411 }
5412
5413 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5414 << "\n");
5415
5416 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5417 // be able to use it.
5418 int TripCount = 0;
5419 int MaxTripCount = 0;
5420 bool MaxOrZero = false;
5421 unsigned TripMultiple = 0;
5422
5423 bool UseUpperBound = false;
5424 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5425 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5426 UseUpperBound);
5427 unsigned Factor = UP.Count;
5428 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5429
5430 // This function returns 1 to signal that the loop should not be unrolled.
5431 if (Factor == 0)
5432 return 1;
5433 return Factor;
5434}
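// Editorial note: this heuristic backs unrollLoopPartial below when it is
// called with Factor == 0, i.e. when the caller asks the builder to choose
// the factor itself, for example:
//
//   CanonicalLoopInfo *Unrolled = nullptr;
//   OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/0, &Unrolled);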
5435
5436 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5437 int32_t Factor,
5438 CanonicalLoopInfo **UnrolledCLI) {
5439 assert(Factor >= 0 && "Unroll factor must not be negative");
5440
5441 Function *F = Loop->getFunction();
5442 LLVMContext &Ctx = F->getContext();
5443
5444 // If the unrolled loop is not used for another loop-associated directive, it
5445 // is sufficient to add metadata for the LoopUnrollPass.
5446 if (!UnrolledCLI) {
5447 SmallVector<Metadata *, 2> LoopMetadata;
5448 LoopMetadata.push_back(
5449 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5450
5451 if (Factor >= 1) {
5452 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5453 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5454 LoopMetadata.push_back(MDNode::get(
5455 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5456 }
5457
5458 addLoopMetadata(Loop, LoopMetadata);
5459 return;
5460 }
5461
5462 // Heuristically determine the unroll factor.
5463 if (Factor == 0)
5464 Factor = computeHeuristicUnrollFactor(Loop);
5465
5466 // No change required with unroll factor 1.
5467 if (Factor == 1) {
5468 *UnrolledCLI = Loop;
5469 return;
5470 }
5471
5472 assert(Factor >= 2 &&
5473 "unrolling only makes sense with a factor of 2 or larger");
5474
5475 Type *IndVarTy = Loop->getIndVarType();
5476
5477 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5478 // unroll the inner loop.
5479 Value *FactorVal =
5480 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5481 /*isSigned=*/false));
5482 std::vector<CanonicalLoopInfo *> LoopNest =
5483 tileLoops(DL, {Loop}, {FactorVal});
5484 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5485 *UnrolledCLI = LoopNest[0];
5486 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5487
5488 // LoopUnrollPass can only fully unroll loops with constant trip count.
5489 // Unroll by the unroll factor with a fallback epilog for the remainder
5490 // iterations if necessary.
5491 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5492 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5493 addLoopMetadata(
5494 InnerLoop,
5495 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5496 MDNode::get(
5497 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5498
5499#ifndef NDEBUG
5500 (*UnrolledCLI)->assertOK();
5501#endif
5502}
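// Editorial sketch of the strategy above for Factor == 4: the loop is tiled
// into a {floor, tile} pair where the tile loop has the constant trip count
// 4, and the tile loop is tagged
//
//   !{!"llvm.loop.unroll.enable"}
//   !{!"llvm.loop.unroll.count", i32 4}
//
// so that LoopUnrollPass can later unroll it completely; the epilogue tile
// covers the remainder iterations.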
5503
5504 OpenMPIRBuilder::InsertPointTy
5505 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5506 llvm::Value *BufSize, llvm::Value *CpyBuf,
5507 llvm::Value *CpyFn, llvm::Value *DidIt) {
5508 if (!updateToLocation(Loc))
5509 return Loc.IP;
5510
5511 uint32_t SrcLocStrSize;
5512 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5513 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5514 Value *ThreadId = getOrCreateThreadID(Ident);
5515
5516 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5517
5518 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5519
5520 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5521 Builder.CreateCall(Fn, Args);
5522
5523 return Builder.saveIP();
5524}
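// Editorial sketch of the call emitted above for one copyprivate variable:
//
//   call void @__kmpc_copyprivate(ptr %loc, i32 %tid, i64 %buf_size,
//                                 ptr %cpy_buf, ptr %cpy_fn, i32 %did_it)
//
// %did_it is 1 only in the thread that executed the single region; the
// runtime broadcasts that thread's data to all others via %cpy_fn and
// includes an implicit barrier.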
5525
5526 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
5527 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5528 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5529 ArrayRef<llvm::Function *> CPFuncs) {
5530
5531 if (!updateToLocation(Loc))
5532 return Loc.IP;
5533
5534 // If needed, allocate and initialize `DidIt` with 0.
5535 // DidIt: flag variable: 1=single thread; 0=not single thread.
5536 llvm::Value *DidIt = nullptr;
5537 if (!CPVars.empty()) {
5538 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5539 Builder.CreateStore(Builder.getInt32(0), DidIt);
5540 }
5541
5542 Directive OMPD = Directive::OMPD_single;
5543 uint32_t SrcLocStrSize;
5544 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5545 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5546 Value *ThreadId = getOrCreateThreadID(Ident);
5547 Value *Args[] = {Ident, ThreadId};
5548
5549 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5550 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5551
5552 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5553 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5554
5555 auto FiniCBWrapper = [&](InsertPointTy IP) {
5556 FiniCB(IP);
5557
5558 // The thread that executes the single region must set `DidIt` to 1.
5559 // This is used by __kmpc_copyprivate, to know if the caller is the
5560 // single thread or not.
5561 if (DidIt)
5562 Builder.CreateStore(Builder.getInt32(1), DidIt);
5563 };
5564
5565 // generates the following:
5566 // if (__kmpc_single()) {
5567 // .... single region ...
5568 // __kmpc_end_single
5569 // }
5570 // __kmpc_copyprivate
5571 // __kmpc_barrier
5572
5573 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5574 /*Conditional*/ true,
5575 /*hasFinalize*/ true);
5576
5577 if (DidIt) {
5578 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5579 // NOTE BufSize is currently unused, so just pass 0.
5580 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5581 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5582 CPFuncs[I], DidIt);
5583 // NOTE __kmpc_copyprivate already inserts a barrier
5584 } else if (!IsNowait)
5585 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5586 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5587 /* CheckCancelFlag */ false);
5588 return Builder.saveIP();
5589}
5590
5591 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
5592 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5593 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5594
5595 if (!updateToLocation(Loc))
5596 return Loc.IP;
5597
5598 Directive OMPD = Directive::OMPD_critical;
5599 uint32_t SrcLocStrSize;
5600 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5601 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5602 Value *ThreadId = getOrCreateThreadID(Ident);
5603 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5604 Value *Args[] = {Ident, ThreadId, LockVar};
5605
5606 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5607 Function *RTFn = nullptr;
5608 if (HintInst) {
5609 // Add Hint to entry Args and create call
5610 EnterArgs.push_back(HintInst);
5611 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5612 } else {
5613 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5614 }
5615 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5616
5617 Function *ExitRTLFn =
5618 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5619 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5620
5621 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5622 /*Conditional*/ false, /*hasFinalize*/ true);
5623}
5624
5625 OpenMPIRBuilder::InsertPointTy
5626 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5627 InsertPointTy AllocaIP, unsigned NumLoops,
5628 ArrayRef<llvm::Value *> StoreValues,
5629 const Twine &Name, bool IsDependSource) {
5630 assert(
5631 llvm::all_of(StoreValues,
5632 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5633 "OpenMP runtime requires depend vec with i64 type");
5634
5635 if (!updateToLocation(Loc))
5636 return Loc.IP;
5637
5638 // Allocate space for vector and generate alloc instruction.
5639 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5640 Builder.restoreIP(AllocaIP);
5641 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5642 ArgsBase->setAlignment(Align(8));
5643 Builder.restoreIP(Loc.IP);
5644
5645 // Store the index value with offset in depend vector.
5646 for (unsigned I = 0; I < NumLoops; ++I) {
5647 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5648 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5649 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5650 STInst->setAlignment(Align(8));
5651 }
5652
5653 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5654 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5655
5656 uint32_t SrcLocStrSize;
5657 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5658 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5659 Value *ThreadId = getOrCreateThreadID(Ident);
5660 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5661
5662 Function *RTLFn = nullptr;
5663 if (IsDependSource)
5664 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5665 else
5666 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5667 Builder.CreateCall(RTLFn, Args);
5668
5669 return Builder.saveIP();
5670}
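// Editorial sketch: for `#pragma omp ordered depend(sink: i-1)` inside a
// loop declared with ordered(1), a frontend would call createOrderedDepend
// with NumLoops = 1, StoreValues = {iv - 1}, IsDependSource = false, yielding
//
//   call void @__kmpc_doacross_wait(ptr %loc, i32 %tid, ptr %vec)
//
// while `depend(source)` stores the current iteration vector and posts it
// via __kmpc_doacross_post.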
5671
5672 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
5673 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5674 FinalizeCallbackTy FiniCB, bool IsThreads) {
5675 if (!updateToLocation(Loc))
5676 return Loc.IP;
5677
5678 Directive OMPD = Directive::OMPD_ordered;
5679 Instruction *EntryCall = nullptr;
5680 Instruction *ExitCall = nullptr;
5681
5682 if (IsThreads) {
5683 uint32_t SrcLocStrSize;
5684 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5685 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5686 Value *ThreadId = getOrCreateThreadID(Ident);
5687 Value *Args[] = {Ident, ThreadId};
5688
5689 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5690 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5691
5692 Function *ExitRTLFn =
5693 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5694 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5695 }
5696
5697 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5698 /*Conditional*/ false, /*hasFinalize*/ true);
5699}
5700
5701OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5702 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5703 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5704 bool HasFinalize, bool IsCancellable) {
5705
5706 if (HasFinalize)
5707 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5708
5709 // Create inlined region's entry and body blocks, in preparation
5710 // for conditional creation
5711 BasicBlock *EntryBB = Builder.GetInsertBlock();
5712 Instruction *SplitPos = EntryBB->getTerminator();
5713 if (!isa_and_nonnull<BranchInst>(SplitPos))
5714 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5715 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5716 BasicBlock *FiniBB =
5717 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5718
5719 Builder.SetInsertPoint(EntryBB->getTerminator());
5720 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5721
5722 // generate body
5723 BodyGenCB(/* AllocaIP */ InsertPointTy(),
5724 /* CodeGenIP */ Builder.saveIP());
5725
5726 // emit exit call and do any needed finalization.
5727 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5728 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5729 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5730 "Unexpected control flow graph state!!");
5731 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5732 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5733 "Unexpected Control Flow State!");
5734 MergeBlockIntoPredecessor(FiniBB);
5735
5736 // If we are skipping the region of a non conditional, remove the exit
5737 // block, and clear the builder's insertion point.
5738 assert(SplitPos->getParent() == ExitBB &&
5739 "Unexpected Insertion point location!");
5740 auto merged = MergeBlockIntoPredecessor(ExitBB);
5741 BasicBlock *ExitPredBB = SplitPos->getParent();
5742 auto InsertBB = merged ? ExitPredBB : ExitBB;
5743 if (!isa_and_nonnull<BranchInst>(SplitPos))
5744 SplitPos->eraseFromParent();
5745 Builder.SetInsertPoint(InsertBB);
5746
5747 return Builder.saveIP();
5748}
5749
5750OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5751 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5752 // If there is nothing to do, return the current insertion point.
5753 if (!Conditional || !EntryCall)
5754 return Builder.saveIP();
5755
5756 BasicBlock *EntryBB = Builder.GetInsertBlock();
5757 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5758 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5759 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5760
5761 // Emit thenBB and set the Builder's insertion point there for
5762 // body generation next. Place the block after the current block.
5763 Function *CurFn = EntryBB->getParent();
5764 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5765
5766 // Move Entry branch to end of ThenBB, and replace with conditional
5767 // branch (If-stmt)
5768 Instruction *EntryBBTI = EntryBB->getTerminator();
5769 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5770 EntryBBTI->removeFromParent();
5771 Builder.SetInsertPoint(UI);
5772 Builder.Insert(EntryBBTI);
5773 UI->eraseFromParent();
5774 Builder.SetInsertPoint(ThenBB->getTerminator());
5775
5776 // return an insertion point to ExitBB.
5777 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5778}
5779
5780OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
5781 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5782 bool HasFinalize) {
5783
5784 Builder.restoreIP(FinIP);
5785
5786 // If there is finalization to do, emit it before the exit call
5787 if (HasFinalize) {
5788 assert(!FinalizationStack.empty() &&
5789 "Unexpected finalization stack state!");
5790
5791 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5792 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5793
5794 Fi.FiniCB(FinIP);
5795
5796 BasicBlock *FiniBB = FinIP.getBlock();
5797 Instruction *FiniBBTI = FiniBB->getTerminator();
5798
5799 // set Builder IP for call creation
5800 Builder.SetInsertPoint(FiniBBTI);
5801 }
5802
5803 if (!ExitCall)
5804 return Builder.saveIP();
5805
5806 // Place the exit call as the last instruction before the finalization block terminator.
5807 ExitCall->removeFromParent();
5808 Builder.Insert(ExitCall);
5809
5810 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5811 ExitCall->getIterator());
5812}
5813
5814 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5815 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5816 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5817 if (!IP.isSet())
5818 return IP;
5819
5820 IRBuilder<>::InsertPointGuard IPG(Builder);
5821
5822 // creates the following CFG structure
5823 // OMP_Entry : (MasterAddr != PrivateAddr)?
5824 // F T
5825 // | \
5826 // | copyin.not.master
5827 // | /
5828 // v /
5829 // copyin.not.master.end
5830 // |
5831 // v
5832 // OMP.Entry.Next
5833
5834 BasicBlock *OMP_Entry = IP.getBlock();
5835 Function *CurFn = OMP_Entry->getParent();
5836 BasicBlock *CopyBegin =
5837 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5838 BasicBlock *CopyEnd = nullptr;
5839
5840 // If entry block is terminated, split to preserve the branch to following
5841 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5842 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5843 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5844 "copyin.not.master.end");
5845 OMP_Entry->getTerminator()->eraseFromParent();
5846 } else {
5847 CopyEnd =
5848 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5849 }
5850
5851 Builder.SetInsertPoint(OMP_Entry);
5852 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5853 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5854 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5855 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5856
5857 Builder.SetInsertPoint(CopyBegin);
5858 if (BranchtoEnd)
5859 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
5860
5861 return Builder.saveIP();
5862}
5863
5864 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
5865 Value *Size, Value *Allocator,
5866 std::string Name) {
5867 IRBuilder<>::InsertPointGuard IPG(Builder);
5868 updateToLocation(Loc);
5869
5870 uint32_t SrcLocStrSize;
5871 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5872 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5873 Value *ThreadId = getOrCreateThreadID(Ident);
5874 Value *Args[] = {ThreadId, Size, Allocator};
5875
5876 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
5877
5878 return Builder.CreateCall(Fn, Args, Name);
5879}
5880
5881 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
5882 Value *Addr, Value *Allocator,
5883 std::string Name) {
5884 IRBuilder<>::InsertPointGuard IPG(Builder);
5885 updateToLocation(Loc);
5886
5887 uint32_t SrcLocStrSize;
5888 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5889 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5890 Value *ThreadId = getOrCreateThreadID(Ident);
5891 Value *Args[] = {ThreadId, Addr, Allocator};
5892 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
5893 return Builder.CreateCall(Fn, Args, Name);
5894}
5895
5896 CallInst *OpenMPIRBuilder::createOMPInteropInit(
5897 const LocationDescription &Loc, Value *InteropVar,
5898 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
5899 Value *DependenceAddress, bool HaveNowaitClause) {
5900 IRBuilder<>::InsertPointGuard IPG(Builder);
5901 updateToLocation(Loc);
5902
5903 uint32_t SrcLocStrSize;
5904 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5905 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5906 Value *ThreadId = getOrCreateThreadID(Ident);
5907 if (Device == nullptr)
5908 Device = ConstantInt::get(Int32, -1);
5909 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
5910 if (NumDependences == nullptr) {
5911 NumDependences = ConstantInt::get(Int32, 0);
5912 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5913 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5914 }
5915 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5916 Value *Args[] = {
5917 Ident, ThreadId, InteropVar, InteropTypeVal,
5918 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
5919
5920 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
5921
5922 return Builder.CreateCall(Fn, Args);
5923}
5924
5925 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
5926 const LocationDescription &Loc, Value *InteropVar, Value *Device,
5927 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
5928 IRBuilder<>::InsertPointGuard IPG(Builder);
5929 updateToLocation(Loc);
5930
5931 uint32_t SrcLocStrSize;
5932 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5933 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5934 Value *ThreadId = getOrCreateThreadID(Ident);
5935 if (Device == nullptr)
5936 Device = ConstantInt::get(Int32, -1);
5937 if (NumDependences == nullptr) {
5938 NumDependences = ConstantInt::get(Int32, 0);
5939 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5940 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5941 }
5942 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5943 Value *Args[] = {
5944 Ident, ThreadId, InteropVar, Device,
5945 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5946
5947 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
5948
5949 return Builder.CreateCall(Fn, Args);
5950}
5951
5952 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
5953 Value *InteropVar, Value *Device,
5954 Value *NumDependences,
5955 Value *DependenceAddress,
5956 bool HaveNowaitClause) {
5957 IRBuilder<>::InsertPointGuard IPG(Builder);
5958 updateToLocation(Loc);
5959 uint32_t SrcLocStrSize;
5960 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5961 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5962 Value *ThreadId = getOrCreateThreadID(Ident);
5963 if (Device == nullptr)
5964 Device = ConstantInt::get(Int32, -1);
5965 if (NumDependences == nullptr) {
5966 NumDependences = ConstantInt::get(Int32, 0);
5967 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5968 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5969 }
5970 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5971 Value *Args[] = {
5972 Ident, ThreadId, InteropVar, Device,
5973 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5974
5975 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
5976
5977 return Builder.CreateCall(Fn, Args);
5978}
5979
5980 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
5981 const LocationDescription &Loc, llvm::Value *Pointer,
5982 llvm::ConstantInt *Size, const llvm::Twine &Name) {
5983 IRBuilder<>::InsertPointGuard IPG(Builder);
5984 updateToLocation(Loc);
5985
5986 uint32_t SrcLocStrSize;
5987 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5988 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5989 Value *ThreadId = getOrCreateThreadID(Ident);
5990 Constant *ThreadPrivateCache =
5991 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
5992 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
5993
5994 Function *Fn =
5995 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
5996
5997 return Builder.CreateCall(Fn, Args);
5998}
5999
6000 OpenMPIRBuilder::InsertPointTy
6001 OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
6002 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6003 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6004 if (!updateToLocation(Loc))
6005 return Loc.IP;
6006
6007 uint32_t SrcLocStrSize;
6008 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6009 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6010 Constant *IsSPMDVal = ConstantInt::getSigned(
6011 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
6012 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6013 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6014 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6015
6016 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6017 Function *Kernel = DebugKernelWrapper;
6018
6019 // We need to strip the debug prefix to get the correct kernel name.
6020 StringRef KernelName = Kernel->getName();
6021 const std::string DebugPrefix = "_debug__";
6022 if (KernelName.ends_with(DebugPrefix)) {
6023 KernelName = KernelName.drop_back(DebugPrefix.length());
6024 Kernel = M.getFunction(KernelName);
6025 assert(Kernel && "Expected the real kernel to exist");
6026 }
6027
6028 // Manifest the launch configuration in the metadata matching the kernel
6029 // environment.
6030 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6031 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6032
6033 // For max values, < 0 means unset, == 0 means set but unknown.
6034 if (MaxThreadsVal < 0)
6035 MaxThreadsVal = std::max(
6036 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6037
6038 if (MaxThreadsVal > 0)
6039 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6040
6041 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6042 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6043 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6044 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6045 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6046 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6047
6048 Function *Fn = getOrCreateRuntimeFunctionPtr(
6049 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6050 const DataLayout &DL = Fn->getDataLayout();
6051
6052 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6053 Constant *DynamicEnvironmentInitializer =
6054 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6055 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6056 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6057 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6058 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6059 DL.getDefaultGlobalsAddressSpace());
6060 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6061
6062 Constant *DynamicEnvironment =
6063 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6064 ? DynamicEnvironmentGV
6065 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6066 DynamicEnvironmentPtr);
6067
6068 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6069 ConfigurationEnvironment, {
6070 UseGenericStateMachineVal,
6071 MayUseNestedParallelismVal,
6072 IsSPMDVal,
6073 MinThreads,
6074 MaxThreads,
6075 MinTeams,
6076 MaxTeams,
6077 ReductionDataSize,
6078 ReductionBufferLength,
6079 });
6080 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6081 KernelEnvironment, {
6082 ConfigurationEnvironmentInitializer,
6083 Ident,
6084 DynamicEnvironment,
6085 });
6086 std::string KernelEnvironmentName =
6087 (KernelName + "_kernel_environment").str();
6088 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6089 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6090 KernelEnvironmentInitializer, KernelEnvironmentName,
6091 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6092 DL.getDefaultGlobalsAddressSpace());
6093 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6094
6095 Constant *KernelEnvironment =
6096 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6097 ? KernelEnvironmentGV
6098 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6099 KernelEnvironmentPtr);
6100 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6101 CallInst *ThreadKind =
6102 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6103
6104 Value *ExecUserCode = Builder.CreateICmpEQ(
6105 ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
6106 "exec_user_code");
6107
6108 // ThreadKind = __kmpc_target_init(...)
6109 // if (ThreadKind == -1)
6110 // user_code
6111 // else
6112 // return;
6113
6114 auto *UI = Builder.CreateUnreachable();
6115 BasicBlock *CheckBB = UI->getParent();
6116 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6117
6118 BasicBlock *WorkerExitBB = BasicBlock::Create(
6119 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6120 Builder.SetInsertPoint(WorkerExitBB);
6121 Builder.CreateRetVoid();
6122
6123 auto *CheckBBTI = CheckBB->getTerminator();
6124 Builder.SetInsertPoint(CheckBBTI);
6125 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6126
6127 CheckBBTI->eraseFromParent();
6128 UI->eraseFromParent();
6129
6130 // Continue in the "user_code" block, see diagram above and in
6131 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6132 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6133}
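// For illustration (a sketch, not part of this file): for a kernel @foo the
// code above materializes roughly the following IR on the device:
//
//   @foo_dynamic_environment = weak_odr protected global ...
//   @foo_kernel_environment = weak_odr protected constant ...
//   %thread_kind = call i32 @__kmpc_target_init(
//       ptr @foo_kernel_environment, ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %thread_kind, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit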
6134
6135void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6136 int32_t TeamsReductionDataSize,
6137 int32_t TeamsReductionBufferLength) {
6138 if (!updateToLocation(Loc))
6139 return;
6140
6141 Function *Fn = getOrCreateRuntimeFunctionPtr(
6142 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6143
6144 Builder.CreateCall(Fn, {});
6145
6146 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6147 return;
6148
6149 Function *Kernel = Builder.GetInsertBlock()->getParent();
6150 // We need to strip the debug prefix to get the correct kernel name.
6151 StringRef KernelName = Kernel->getName();
6152 const std::string DebugPrefix = "_debug__";
6153 if (KernelName.ends_with(DebugPrefix))
6154 KernelName = KernelName.drop_back(DebugPrefix.length());
6155 auto *KernelEnvironmentGV =
6156 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6157 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6158 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6159 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6160 KernelEnvironmentInitializer,
6161 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6162 NewInitializer = ConstantFoldInsertValueInstruction(
6163 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6164 {0, 8});
6165 KernelEnvironmentGV->setInitializer(NewInitializer);
6166}
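// Note on the {0, 7} and {0, 8} indices above: field 0 of the kernel
// environment is the ConfigurationEnvironment built in createTargetInit,
// whose initializer lists UseGenericStateMachine, MayUseNestedParallelism,
// IsSPMD, MinThreads, MaxThreads, MinTeams, MaxTeams, ReductionDataSize and
// ReductionBufferLength in that order, so slots 7 and 8 are exactly the two
// reduction fields being rewritten here.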
6167
6168static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6169 Module &M = *Kernel.getParent();
6170 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6171 for (auto *Op : MD->operands()) {
6172 if (Op->getNumOperands() != 3)
6173 continue;
6174 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6175 if (!KernelOp || KernelOp->getValue() != &Kernel)
6176 continue;
6177 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6178 if (!Prop || Prop->getString() != Name)
6179 continue;
6180 return Op;
6181 }
6182 return nullptr;
6183}
6184
6185static void updateNVPTXMetadata(Function &Kernel, StringRef Name,
6186 int32_t Value, bool Min) {
6187 // Update the "maxntidx" metadata for NVIDIA, or add it.
6188 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6189 if (ExistingOp) {
6190 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6191 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6192 ExistingOp->replaceOperandWith(
6193 2, ConstantAsMetadata::get(ConstantInt::get(
6194 OldVal->getValue()->getType(),
6195 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6196 } else {
6197 LLVMContext &Ctx = Kernel.getContext();
6198 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6199 MDString::get(Ctx, Name),
6200 ConstantAsMetadata::get(
6201 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6202 // Append metadata to nvvm.annotations
6203 Module &M = *Kernel.getParent();
6204 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6205 MD->addOperand(MDNode::get(Ctx, MDVals));
6206 }
6207}
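// For illustration (hypothetical kernel @K): after
// updateNVPTXMetadata(K, "maxntidx", 128, /*Min=*/true) the module carries
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @K, !"maxntidx", i32 128}
//
// and a later call with a different value only tightens the existing limit,
// since Min selects std::min (otherwise std::max) of the old and new operand.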
6208
6209std::pair<int32_t, int32_t>
6210OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6211 int32_t ThreadLimit =
6212 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6213
6214 if (T.isAMDGPU()) {
6215 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6216 if (!Attr.isValid() || !Attr.isStringAttribute())
6217 return {0, ThreadLimit};
6218 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6219 int32_t LB, UB;
6220 if (!llvm::to_integer(UBStr, UB, 10))
6221 return {0, ThreadLimit};
6222 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6223 if (!llvm::to_integer(LBStr, LB, 10))
6224 return {0, UB};
6225 return {LB, UB};
6226 }
6227
6228 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6229 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6230 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6231 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6232 }
6233 return {0, ThreadLimit};
6234}
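// For illustration (hypothetical values): given
//   "amdgpu-flat-work-group-size"="1,256" and omp_target_thread_limit=128,
// the AMDGPU path above parses LB=1 and UB=256, clamps UB to the thread
// limit and returns {1, 128}; a missing or unparsable attribute falls back
// to {0, ThreadLimit}.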
6235
6236void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6237 Function &Kernel, int32_t LB,
6238 int32_t UB) {
6239 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6240
6241 if (T.isAMDGPU()) {
6242 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6243 llvm::utostr(LB) + "," + llvm::utostr(UB));
6244 return;
6245 }
6246
6247 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6248}
6249
6250std::pair<int32_t, int32_t>
6251OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6252 // TODO: Read from backend annotations if available.
6253 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6254}
6255
6256void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6257 int32_t LB, int32_t UB) {
6258 if (T.isNVPTX())
6259 if (UB > 0)
6260 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6261 if (T.isAMDGPU())
6262 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6263
6264 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6265}
6266
6267void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6268 Function *OutlinedFn) {
6269 if (Config.isTargetDevice()) {
6270 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6271 // TODO: Determine if DSO local can be set to true.
6272 OutlinedFn->setDSOLocal(false);
6273 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6274 if (T.isAMDGCN())
6275 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6276 }
6277}
6278
6279Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6280 StringRef EntryFnIDName) {
6281 if (Config.isTargetDevice()) {
6282 assert(OutlinedFn && "The outlined function must exist if embedded");
6283 return OutlinedFn;
6284 }
6285
6286 return new GlobalVariable(
6287 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6288 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6289}
6290
6291Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6292 StringRef EntryFnName) {
6293 if (OutlinedFn)
6294 return OutlinedFn;
6295
6296 assert(!M.getGlobalVariable(EntryFnName, true) &&
6297 "Named kernel already exists?");
6298 return new GlobalVariable(
6299 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6300 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6301}
6302
6303void OpenMPIRBuilder::emitTargetRegionFunction(
6304 TargetRegionEntryInfo &EntryInfo,
6305 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6306 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6307
6308 SmallString<64> EntryFnName;
6309 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6310
6311 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
6312 ? GenerateFunctionCallback(EntryFnName)
6313 : nullptr;
6314
6315 // If this target outline function is not an offload entry, we don't need to
6316 // register it. This may be the case with a false 'if' clause, or if there are
6317 // no OpenMP targets.
6318 if (!IsOffloadEntry)
6319 return;
6320
6321 std::string EntryFnIDName =
6322 Config.isTargetDevice()
6323 ? std::string(EntryFnName)
6324 : createPlatformSpecificName({EntryFnName, "region_id"});
6325
6326 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6327 EntryFnName, EntryFnIDName);
6328}
6329
6331 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6332 StringRef EntryFnName, StringRef EntryFnIDName) {
6333 if (OutlinedFn)
6334 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6335 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6336 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6337 OffloadInfoManager.registerTargetRegionEntryInfo(
6338 EntryInfo, EntryAddr, OutlinedFnID,
6339 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6340 return OutlinedFnID;
6341}
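// For illustration: on the host the returned OutlinedFnID is only a unique
// address, e.g. a weak i8 global named "<entry>.region_id" as produced by
// createOutlinedFunctionID above, while on the device the outlined function
// itself doubles as its own ID.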
6342
6343OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
6344 const LocationDescription &Loc, InsertPointTy AllocaIP,
6345 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6346 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6347 omp::RuntimeFunction *MapperFunc,
6348 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
6349 BodyGenCB,
6350 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6351 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6352 if (!updateToLocation(Loc))
6353 return InsertPointTy();
6354
6355 // Disable TargetData CodeGen on Device pass.
6356 if (Config.IsTargetDevice.value_or(false)) {
6357 if (BodyGenCB)
6358 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6359 return Builder.saveIP();
6360 }
6361
6362 Builder.restoreIP(CodeGenIP);
6363 bool IsStandAlone = !BodyGenCB;
6364 MapInfosTy *MapInfo;
6365 // Generate the code for the opening of the data environment. Capture all the
6366 // arguments of the runtime call by reference because they are used in the
6367 // closing of the region.
6368 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6369 MapInfo = &GenMapInfoCB(Builder.saveIP());
6370 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6371 /*IsNonContiguous=*/true, DeviceAddrCB,
6372 CustomMapperCB);
6373
6374 TargetDataRTArgs RTArgs;
6375 emitOffloadingArraysArgument(Builder, RTArgs, Info,
6376 !MapInfo->Names.empty());
6377
6378 // Emit the number of elements in the offloading arrays.
6379 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6380
6381 // Source location for the ident struct
6382 if (!SrcLocInfo) {
6383 uint32_t SrcLocStrSize;
6384 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6385 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6386 }
6387
6388 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6389 PointerNum, RTArgs.BasePointersArray,
6390 RTArgs.PointersArray, RTArgs.SizesArray,
6391 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6392 RTArgs.MappersArray};
6393
6394 if (IsStandAlone) {
6395 assert(MapperFunc && "MapperFunc missing for standalone target data");
6396 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6397 OffloadingArgs);
6398 } else {
6399 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6400 omp::OMPRTL___tgt_target_data_begin_mapper);
6401
6402 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6403
6404 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6405 if (isa<AllocaInst>(DeviceMap.second.second)) {
6406 auto *LI =
6407 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6408 Builder.CreateStore(LI, DeviceMap.second.second);
6409 }
6410 }
6411
6412 // If device pointer privatization is required, emit the body of the
6413 // region here. It will have to be duplicated: with and without
6414 // privatization.
6415 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
6416 }
6417 };
6418
6419 // If we need device pointer privatization, we need to emit the body of the
6420 // region with no privatization in the 'else' branch of the conditional.
6421 // Otherwise, we don't have to do anything.
6422 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6424 };
6425
6426 // Generate code for the closing of the data region.
6427 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6428 TargetDataRTArgs RTArgs;
6429 emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
6430 /*ForEndCall=*/true);
6431
6432 // Emit the number of elements in the offloading arrays.
6433 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6434
6435 // Source location for the ident struct
6436 if (!SrcLocInfo) {
6437 uint32_t SrcLocStrSize;
6438 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6439 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6440 }
6441
6442 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6443 PointerNum, RTArgs.BasePointersArray,
6444 RTArgs.PointersArray, RTArgs.SizesArray,
6445 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6446 RTArgs.MappersArray};
6447 Function *EndMapperFunc =
6448 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6449
6450 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6451 };
6452
6453 // We don't have to do anything to close the region if the if clause evaluates
6454 // to false.
6455 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
6456
6457 if (BodyGenCB) {
6458 if (IfCond) {
6459 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6460 } else {
6461 BeginThenGen(AllocaIP, Builder.saveIP());
6462 }
6463
6464 // If we don't require privatization of device pointers, we emit the body in
6465 // between the runtime calls. This avoids duplicating the body code.
6466 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6467
6468 if (IfCond) {
6469 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6470 } else {
6471 EndThenGen(AllocaIP, Builder.saveIP());
6472 }
6473 } else {
6474 if (IfCond) {
6475 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6476 } else {
6477 BeginThenGen(AllocaIP, Builder.saveIP());
6478 }
6479 }
6480
6481 return Builder.saveIP();
6482}
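// For illustration (a sketch with invented operands): for a "target data"
// region with two mapped variables, BeginThenGen above emits roughly
//
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 -1, i32 2,
//       ptr %.offload_baseptrs, ptr %.offload_ptrs, ptr %.offload_sizes,
//       ptr @.offload_maptypes, ptr @.offload_mapnames, ptr %.offload_mappers)
//
// and EndThenGen emits the matching __tgt_target_data_end_mapper call; a
// standalone construct (no BodyGenCB) instead issues the single MapperFunc
// runtime call.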
6483
6484FunctionCallee OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6485 bool IVSigned,
6486 bool IsGPUDistribute) {
6487 assert((IVSize == 32 || IVSize == 64) &&
6488 "IV size is not compatible with the omp runtime");
6489 RuntimeFunction Name;
6490 if (IsGPUDistribute)
6491 Name = IVSize == 32
6492 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6493 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6494 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6495 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6496 else
6497 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6498 : omp::OMPRTL___kmpc_for_static_init_4u)
6499 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6500 : omp::OMPRTL___kmpc_for_static_init_8u);
6501
6502 return getOrCreateRuntimeFunction(M, Name);
6503}
6504
6505FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6506 bool IVSigned) {
6507 assert((IVSize == 32 || IVSize == 64) &&
6508 "IV size is not compatible with the omp runtime");
6509 RuntimeFunction Name = IVSize == 32
6510 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6511 : omp::OMPRTL___kmpc_dispatch_init_4u)
6512 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6513 : omp::OMPRTL___kmpc_dispatch_init_8u);
6514
6515 return getOrCreateRuntimeFunction(M, Name);
6516}
6517
6518FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6519 bool IVSigned) {
6520 assert((IVSize == 32 || IVSize == 64) &&
6521 "IV size is not compatible with the omp runtime");
6522 RuntimeFunction Name = IVSize == 32
6523 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6524 : omp::OMPRTL___kmpc_dispatch_next_4u)
6525 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6526 : omp::OMPRTL___kmpc_dispatch_next_8u);
6527
6528 return getOrCreateRuntimeFunction(M, Name);
6529}
6530
6531FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6532 bool IVSigned) {
6533 assert((IVSize == 32 || IVSize == 64) &&
6534 "IV size is not compatible with the omp runtime");
6535 RuntimeFunction Name = IVSize == 32
6536 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6537 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6538 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6539 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6540
6541 return getOrCreateRuntimeFunction(M, Name);
6542}
6543
6544FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
6545 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6546}
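// For illustration: createDispatchInitFunction(32, /*IVSigned=*/true)
// resolves to __kmpc_dispatch_init_4 and (64, /*IVSigned=*/false) to
// __kmpc_dispatch_init_8u; the next and fini variants follow the same
// 4/4u/8/8u naming scheme keyed on IV size and signedness.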
6547
6548static Function *createOutlinedFunction(
6549 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6550 SmallVectorImpl<Value *> &Inputs,
6551 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6552 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6553 SmallVector<Type *> ParameterTypes;
6554 if (OMPBuilder.Config.isTargetDevice()) {
6555 // Add the "implicit" runtime argument we use to provide launch specific
6556 // information for target devices.
6557 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6558 ParameterTypes.push_back(Int8PtrTy);
6559
6560 // All parameters to target devices are passed as pointers
6561 // or i64. This assumes 64-bit address spaces/pointers.
6562 for (auto &Arg : Inputs)
6563 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6564 ? Arg->getType()
6565 : Type::getInt64Ty(Builder.getContext()));
6566 } else {
6567 for (auto &Arg : Inputs)
6568 ParameterTypes.push_back(Arg->getType());
6569 }
6570
6571 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6572 /*isVarArg*/ false);
6573 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
6574 Builder.GetInsertBlock()->getModule());
6575
6576 // Save insert point.
6577 auto OldInsertPoint = Builder.saveIP();
6578
6579 // Generate the region into the function.
6580 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6581 Builder.SetInsertPoint(EntryBB);
6582
6583 // Insert target init call in the device compilation pass.
6584 if (OMPBuilder.Config.isTargetDevice())
6585 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6586
6587 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6588
6589 // As we embed the user code in the middle of our target region after we
6590 // generate entry code, we must move what allocas we can into the entry
6591 // block to avoid possibly breaking optimisations for the device.
6592 if (OMPBuilder.Config.isTargetDevice())
6593 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6594
6595 // Insert target deinit call in the device compilation pass.
6596 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
6597 if (OMPBuilder.Config.isTargetDevice())
6598 OMPBuilder.createTargetDeinit(Builder);
6599
6600 // Insert return instruction.
6601 Builder.CreateRetVoid();
6602
6603 // New Alloca IP at entry point of created device function.
6604 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6605 auto AllocaIP = Builder.saveIP();
6606
6607 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6608
6609 // Skip the artificial dyn_ptr on the device.
6610 const auto &ArgRange =
6611 OMPBuilder.Config.isTargetDevice()
6612 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6613 : Func->args();
6614
6615 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6616 // Things like GEP's can come in the form of Constants. Constants and
6617 // ConstantExpr's do not have access to the knowledge of what they're
6618 // contained in, so we must dig a little to find an instruction so we
6619 // can tell if they're used inside of the function we're outlining. We
6620 // also replace the original constant expression with an equivalent new
6621 // instruction: an instruction allows easy modification in the following
6622 // loop, since it is owned by our target function and replaceUsesOfWith
6623 // can be invoked on it (that is not possible with constants). A brand
6624 // new instruction also lets us be cautious, as it is possible that the
6625 // old expression, while used inside the outlined function, also exists
6626 // and is used externally (unlikely given the nature of a Constant, but
6627 // still possible).
6628 // NOTE: We cannot remove dead constants that have been rewritten to
6629 // instructions at this stage, we run the risk of breaking later lowering
6630 // by doing so as we could still be in the process of lowering the module
6631 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6632 // constants we have created rewritten versions of.
6633 if (auto *Const = dyn_cast<Constant>(Input))
6634 convertUsersOfConstantsToInstructions(Const, Func, false);
6635
6636 // Collect all the instructions
6637 for (User *User : make_early_inc_range(Input->users()))
6638 if (auto *Instr = dyn_cast<Instruction>(User))
6639 if (Instr->getFunction() == Func)
6640 Instr->replaceUsesOfWith(Input, InputCopy);
6641 };
6642
6643 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6644
6645 // Rewrite uses of input values to parameters.
6646 for (auto InArg : zip(Inputs, ArgRange)) {
6647 Value *Input = std::get<0>(InArg);
6648 Argument &Arg = std::get<1>(InArg);
6649 Value *InputCopy = nullptr;
6650
6651 Builder.restoreIP(
6652 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
6653
6654 // In certain cases a Global may be set up for replacement; however, this
6655 // Global may be used in multiple arguments to the kernel, just segmented
6656 // apart. For example, a global array may be sectioned into multiple
6657 // mappings (technically not legal in OpenMP, but necessary for Fortran
6658 // common blocks). We then end up with GEPs into this array inside the
6659 // kernel that refer to the Global but are, for all intents and purposes,
6660 // separate arguments to the kernel. If a mapped segment requires a GEP
6661 // into the 0-th index, it folds into a direct reference to the Global.
6662 // If we then encounter this folded GEP during replacement, every
6663 // reference to the Global in the kernel is replaced with the argument
6664 // generated for it, including any other GEPs that refer to the Global
6665 // and correspond to other arguments. That would invalidate all of the
6666 // previously mapped arguments that refer to the same global as separate
6667 // segments. To prevent this, we defer processing of globals until all
6668 // other processing has been performed.
6669 //
6670 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6671 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6672 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6673 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6674 continue;
6675 }
6676
6677 ReplaceValue(Input, InputCopy, Func);
6678 }
6679
6680 // Replace all of our deferred Input values, currently just Globals.
6681 for (auto Deferred : DeferredReplacement)
6682 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6683
6684 // Restore insert point.
6685 Builder.restoreIP(OldInsertPoint);
6686
6687 return Func;
6688}
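// For illustration (a sketch, kernel name invented): for one pointer and one
// scalar input, the device-side wrapper built above looks roughly like
//
//   define internal void @__omp_offloading_xy_foo(ptr %dyn_ptr, ptr %a,
//                                                 i64 %n) {
//   entry:
//     ... target init / user code / target deinit ...
//     ret void
//   }
//
// where %dyn_ptr is the implicit launch argument and non-pointer inputs are
// passed as i64, as the parameter-type loop above documents.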
6689
6690/// Create an entry point for a target task with the following.
6691/// It'll have the following signature
6692/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6693/// This function is called from emitTargetTask once the
6694/// code to launch the target kernel has been outlined already.
6695static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6696 IRBuilderBase &Builder,
6697 CallInst *StaleCI) {
6698 Module &M = OMPBuilder.M;
6699 // KernelLaunchFunction is the target launch function, i.e.
6700 // the function that sets up kernel arguments and calls
6701 // __tgt_target_kernel to launch the kernel on the device.
6702 //
6703 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6704
6705 // StaleCI is the CallInst which is the call to the outlined
6706 // target kernel launch function. If there are values that the
6707 // outlined function uses then these are aggregated into a structure
6708 // which is passed as the second argument. If not, then there's
6709 // only one argument, the threadID. So, StaleCI can be
6710 //
6711 // %structArg = alloca { ptr, ptr }, align 8
6712 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6713 // store ptr %20, ptr %gep_, align 8
6714 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6715 // store ptr %21, ptr %gep_8, align 8
6716 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6717 //
6718 // OR
6719 //
6720 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6721 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
6722 StaleCI->getIterator());
6723 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6724 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6725 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6726 Type *TaskTy = OMPBuilder.Task;
6727 auto ProxyFnTy =
6728 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6729 /* isVarArg */ false);
6730 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6731 ".omp_target_task_proxy_func",
6732 Builder.GetInsertBlock()->getModule());
6733 ProxyFn->getArg(0)->setName("thread.id");
6734 ProxyFn->getArg(1)->setName("task");
6735
6736 BasicBlock *EntryBB =
6737 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6738 Builder.SetInsertPoint(EntryBB);
6739
6740 bool HasShareds = StaleCI->arg_size() > 1;
6741 // TODO: This is a temporary assert to prove to ourselves that
6742 // the outlined target launch function is always going to have
6743 // at most two arguments if there is any data shared between
6744 // host and device.
6745 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6746 "StaleCI with shareds should have exactly two arguments.");
6747 if (HasShareds) {
6748 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6749 assert(ArgStructAlloca &&
6750 "Unable to find the alloca instruction corresponding to arguments "
6751 "for extracted function");
6752 auto *ArgStructType =
6753 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6754
6755 AllocaInst *NewArgStructAlloca =
6756 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6757 Value *TaskT = ProxyFn->getArg(1);
6758 Value *ThreadId = ProxyFn->getArg(0);
6759 Value *SharedsSize =
6760 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6761
6762 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6763 LoadInst *LoadShared =
6764 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
6765
6766 Builder.CreateMemCpy(
6767 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
6768 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
6769
6770 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
6771 }
6772 Builder.CreateRetVoid();
6773 return ProxyFn;
6774}
6775static void emitTargetOutlinedFunction(
6776 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6777 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
6778 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
6779 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6780 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6781
6782 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
6783 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
6784 &ArgAccessorFuncCB](StringRef EntryFnName) {
6785 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
6786 CBFunc, ArgAccessorFuncCB);
6787 };
6788
6789 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
6790 OutlinedFn, OutlinedFnID);
6791}
6792OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
6793 Function *OutlinedFn, Value *OutlinedFnID,
6794 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
6795 Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
6796 SmallVector<OpenMPIRBuilder::DependData> &Dependencies,
6797 bool HasNoWait) {
6798
6799 // When we arrive at this function, the target region itself has been
6800 // outlined into the function OutlinedFn.
6801 // So at this point, for
6802 // --------------------------------------------------
6803 // void user_code_that_offloads(...) {
6804 // omp target depend(..) map(from:a) map(to:b, c)
6805 // a = b + c
6806 // }
6807 //
6808 // --------------------------------------------------
6809 //
6810 // we have
6811 //
6812 // --------------------------------------------------
6813 //
6814 // void user_code_that_offloads(...) {
6815 // %.offload_baseptrs = alloca [3 x ptr], align 8
6816 // %.offload_ptrs = alloca [3 x ptr], align 8
6817 // %.offload_mappers = alloca [3 x ptr], align 8
6818 // ;; target region has been outlined and now we need to
6819 // ;; offload to it via a target task.
6820 // }
6821 // void outlined_device_function(ptr a, ptr b, ptr c) {
6822 // *a = *b + *c
6823 // }
6824 //
6825 // We have to now do the following
6826 // (i) Make an offloading call to outlined_device_function using the OpenMP
6827 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
6828 // emitted by emitKernelLaunch
6829 // (ii) Create a task entry point function that calls kernel_launch_function
6830 // and is the entry point for the target task. See
6831 // '@.omp_target_task_proxy_func in the pseudocode below.
6832 // (iii) Create a task with the task entry point created in (ii)
6833 //
6834 // That is we create the following
6835 //
6836 // void user_code_that_offloads(...) {
6837 // %.offload_baseptrs = alloca [3 x ptr], align 8
6838 // %.offload_ptrs = alloca [3 x ptr], align 8
6839 // %.offload_mappers = alloca [3 x ptr], align 8
6840 //
6841 // %structArg = alloca { ptr, ptr, ptr }, align 8
6842 // %strucArg[0] = %.offload_baseptrs
6843 // %strucArg[1] = %.offload_ptrs
6844 // %strucArg[2] = %.offload_mappers
6845 // proxy_target_task = @__kmpc_omp_task_alloc(...,
6846 // @.omp_target_task_proxy_func)
6847 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
6848 // dependencies_array = ...
6849 // ;; if nowait not present
6850 // call @__kmpc_omp_wait_deps(..., dependencies_array)
6851 // call @__kmpc_omp_task_begin_if0(...)
6852 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
6853 // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
6854 // }
6855 //
6856 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
6857 // ptr %task) {
6858 // %structArg = alloca {ptr, ptr, ptr}
6859 // %shared_data = load (getelementptr %task, 0, 0)
6860 // mempcy(%structArg, %shared_data, sizeof(structArg))
6861 // kernel_launch_function(%thread.id, %structArg)
6862 // }
6863 //
6864 // We need the proxy function because the signature of the task entry point
6865 // expected by kmpc_omp_task is always the same and will be different from
6866 // that of the kernel_launch function.
6867 //
6868 // kernel_launch_function is generated by emitKernelLaunch and has the
6869 // always_inline attribute.
6870 // void kernel_launch_function(thread_id,
6871 // structArg) alwaysinline {
6872 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
6873 // offload_baseptrs = load(getelementptr structArg, 0, 0)
6874 // offload_ptrs = load(getelementptr structArg, 0, 1)
6875 // offload_mappers = load(getelementptr structArg, 0, 2)
6876 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
6877 // ; offload_mappers
6878 // call i32 @__tgt_target_kernel(...,
6879 // outlined_device_function,
6880 // ptr %kernel_args)
6881 // }
6882 // void outlined_device_function(ptr a, ptr b, ptr c) {
6883 // *a = *b + *c
6884 // }
6885 //
6886 BasicBlock *TargetTaskBodyBB =
6887 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
6888 BasicBlock *TargetTaskAllocaBB =
6889 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
6890
6891 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
6892 TargetTaskAllocaBB->begin());
6893 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
6894
6895 OutlineInfo OI;
6896 OI.EntryBB = TargetTaskAllocaBB;
6897 OI.OuterAllocaBB = AllocaIP.getBlock();
6898
6899 // Add the thread ID argument.
6900 SmallVector<Instruction *, 4> ToBeDeleted;
6901 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6902 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
6903
6904 Builder.restoreIP(TargetTaskBodyIP);
6905
6906 // emitKernelLaunch makes the necessary runtime call to offload the kernel.
6907 // We then outline all that code into a separate function
6908 // ('kernel_launch_function' in the pseudo code above). This function is then
6909 // called by the target task proxy function (see
6910 // '@.omp_target_task_proxy_func' in the pseudo code above)
6911 // "@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction
6912 Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
6913 EmitTargetCallFallbackCB, Args, DeviceID,
6914 RTLoc, TargetTaskAllocaIP));
6915
6916 OI.ExitBB = Builder.saveIP().getBlock();
6917 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
6918 HasNoWait](Function &OutlinedFn) mutable {
6919 assert(OutlinedFn.getNumUses() == 1 &&
6920 "there must be a single user for the outlined function");
6921
6922 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6923 bool HasShareds = StaleCI->arg_size() > 1;
6924
6925 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
6926
6927 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
6928 << "\n");
6929
6930 Builder.SetInsertPoint(StaleCI);
6931
6932 // Gather the arguments for emitting the runtime call.
6933 uint32_t SrcLocStrSize;
6934 Constant *SrcLocStr =
6935 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
6936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6937
6938 // @__kmpc_omp_task_alloc
6939 Function *TaskAllocFn =
6940 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
6941
6942 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
6943 // call.
6944 Value *ThreadID = getOrCreateThreadID(Ident);
6945
6946 // Argument - `sizeof_kmp_task_t` (TaskSize)
6947 // Tasksize refers to the size in bytes of kmp_task_t data structure
6948 // including private vars accessed in task.
6949 // TODO: add kmp_task_t_with_privates (privates)
6950 Value *TaskSize =
6951 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
6952
6953 // Argument - `sizeof_shareds` (SharedsSize)
6954 // SharedsSize refers to the shareds array size in the kmp_task_t data
6955 // structure.
6956 Value *SharedsSize = Builder.getInt64(0);
6957 if (HasShareds) {
6958 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6959 assert(ArgStructAlloca &&
6960 "Unable to find the alloca instruction corresponding to arguments "
6961 "for extracted function");
6962 auto *ArgStructType =
6963 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6964 assert(ArgStructType && "Unable to find struct type corresponding to "
6965 "arguments for extracted function");
6966 SharedsSize =
6967 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6968 }
6969
6970 // Argument - `flags`
6971 // Task is tied iff (Flags & 1) == 1.
6972 // Task is untied iff (Flags & 1) == 0.
6973 // Task is final iff (Flags & 2) == 2.
6974 // Task is not final iff (Flags & 2) == 0.
6975 // A target task is not final and is untied.
6976 Value *Flags = Builder.getInt32(0);
6977
6978 // Emit the @__kmpc_omp_task_alloc runtime call
6979 // The runtime call returns a pointer to an area where the task captured
6980 // variables must be copied before the task is run (TaskData)
6981 CallInst *TaskData = Builder.CreateCall(
6982 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
6983 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
6984 /*task_func=*/ProxyFn});
6985
6986 if (HasShareds) {
6987 Value *Shareds = StaleCI->getArgOperand(1);
6988 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
6989 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
6990 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
6991 SharedsSize);
6992 }
6993
6994 Value *DepArray = emitTaskDependencies(*this, Dependencies);
6995
6996 // ---------------------------------------------------------------
6997 // V5.2 13.8 target construct
6998 // If the nowait clause is present, execution of the target task
6999 // may be deferred. If the nowait clause is not present, the target task is
7000 // an included task.
7001 // ---------------------------------------------------------------
7002 // The above means that the lack of a nowait on the target construct
7003 // translates to '#pragma omp task if(0)'
7004 if (!HasNoWait) {
7005 if (DepArray) {
7006 Function *TaskWaitFn =
7007 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7008 Builder.CreateCall(
7009 TaskWaitFn,
7010 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7011 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7012 /*dep_list=*/DepArray,
7013 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7014 /*noalias_dep_list=*/
7015 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7016 }
7017 // Included task.
7018 Function *TaskBeginFn =
7019 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7020 Function *TaskCompleteFn =
7021 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7022 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7023 CallInst *CI = nullptr;
7024 if (HasShareds)
7025 CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7026 else
7027 CI = Builder.CreateCall(ProxyFn, {ThreadID});
7028 CI->setDebugLoc(StaleCI->getDebugLoc());
7029 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7030 } else if (DepArray) {
7031 // HasNoWait - meaning the task may be deferred. Call
7032 // __kmpc_omp_task_with_deps if there are dependencies,
7033 // else call __kmpc_omp_task
7034 Function *TaskFn =
7035 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7036 Builder.CreateCall(
7037 TaskFn,
7038 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7039 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7040 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7041 } else {
7042 // Emit the @__kmpc_omp_task runtime call to spawn the task
7043 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7044 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7045 }
7046
7047 StaleCI->eraseFromParent();
7048 llvm::for_each(llvm::reverse(ToBeDeleted),
7049 [](Instruction *I) { I->eraseFromParent(); });
7050 };
7051 addOutlineInfo(std::move(OI));
7052
7053 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7054 << *(Builder.GetInsertBlock()) << "\n");
7055 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7057 << "\n");
7058 return Builder.saveIP();
7059}
7060static void emitTargetCall(
7061 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7062 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7063 Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
7064 SmallVectorImpl<Value *> &Args,
7065 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7066 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
7067
7068 OpenMPIRBuilder::TargetDataInfo Info(
7069 /*RequiresDevicePointerInfo=*/false,
7070 /*SeparateBeginEndCalls=*/true);
7071
7072 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7073 OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
7074 /*IsNonContiguous=*/true);
7075
7076 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7077 OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
7078 !MapInfo.Names.empty());
7079
7080 // emitKernelLaunch
7081 auto &&EmitTargetCallFallbackCB =
7082 [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
7083 Builder.restoreIP(IP);
7084 Builder.CreateCall(OutlinedFn, Args);
7085 return Builder.saveIP();
7086 };
7087
7088 unsigned NumTargetItems = MapInfo.BasePointers.size();
7089 // TODO: Use correct device ID
7090 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7091 Value *NumTeamsVal = Builder.getInt32(NumTeams);
7092 Value *NumThreadsVal = Builder.getInt32(NumThreads);
7093 uint32_t SrcLocStrSize;
7094 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7095 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7096 llvm::omp::IdentFlag(0), 0);
7097 // TODO: Use correct NumIterations
7098 Value *NumIterations = Builder.getInt64(0);
7099 // TODO: Use correct DynCGGroupMem
7100 Value *DynCGGroupMem = Builder.getInt32(0);
7101
7102 bool HasNoWait = false;
7103 bool HasDependencies = Dependencies.size() > 0;
7104 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7105
7106 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
7107 NumTeamsVal, NumThreadsVal,
7108 DynCGGroupMem, HasNoWait);
7109
7110 // The presence of certain clauses on the target directive require the
7111 // explicit generation of the target task.
7112 if (RequiresOuterTargetTask) {
7113 Builder.restoreIP(OMPBuilder.emitTargetTask(
7114 OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
7115 RTLoc, AllocaIP, Dependencies, HasNoWait));
7116 } else {
7117 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
7118 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
7119 DeviceID, RTLoc, AllocaIP));
7120 }
7121}
7122OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
7123 const LocationDescription &Loc, InsertPointTy AllocaIP,
7124 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
7125 int32_t NumThreads, SmallVectorImpl<Value *> &Args,
7126 GenMapInfoCallbackTy GenMapInfoCB,
7127 TargetBodyGenCallbackTy CBFunc,
7128 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7129 SmallVector<DependData> Dependencies) {
7130
7131 if (!updateToLocation(Loc))
7132 return InsertPointTy();
7133
7134 Builder.restoreIP(CodeGenIP);
7135
7136 Function *OutlinedFn;
7137 Constant *OutlinedFnID;
7138 // The target region is outlined into its own function. The LLVM IR for
7139 // the target region itself is generated using the callbacks CBFunc
7140 // and ArgAccessorFuncCB
7141 emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
7142 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
7143
7144 // If we are not on the target device, then we need to generate code
7145 // to make a remote call (offload) to the previously outlined function
7146 // that represents the target region. Do that now.
7147 if (!Config.isTargetDevice())
7148 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7149 NumThreads, Args, GenMapInfoCB, Dependencies);
7150 return Builder.saveIP();
7151}
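// For illustration (pseudo-IR, operands invented): on the host a
// "#pragma omp target" compiled through createTarget thus becomes roughly
//
//   %rc = call i32 @__tgt_target_kernel(ptr @ident, i64 -1, i32 %nteams,
//       i32 %nthreads, ptr @.region_id, ptr %kernel_args)
//   ; on failure, EmitTargetCallFallbackCB calls the outlined host fallback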
7152
7153std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7154 StringRef FirstSeparator,
7155 StringRef Separator) {
7156 SmallString<128> Buffer;
7157 llvm::raw_svector_ostream OS(Buffer);
7158 StringRef Sep = FirstSeparator;
7159 for (StringRef Part : Parts) {
7160 OS << Sep << Part;
7161 Sep = Separator;
7162 }
7163 return OS.str().str();
7164}
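// For illustration: getNameWithSeparators({"omp_offloading", "region"},
// ".", "_") yields ".omp_offloading_region": the first separator prefixes
// the first part and the regular separator joins the remaining parts.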
7165
7166std::string
7167OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7168 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7169 Config.separator());
7170}
7171
7172GlobalVariable *
7173OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7174 unsigned AddressSpace) {
7175 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7176 if (Elem.second) {
7177 assert(Elem.second->getValueType() == Ty &&
7178 "OMP internal variable has different type than requested");
7179 } else {
7180 // TODO: investigate the appropriate linkage type used for the global
7181 // variable for possibly changing that to internal or private, or maybe
7182 // create different versions of the function for different OMP internal
7183 // variables.
7184 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7185 ? GlobalValue::InternalLinkage
7186 : GlobalValue::CommonLinkage;
7187 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7188 Constant::getNullValue(Ty), Elem.first(),
7189 /*InsertBefore=*/nullptr,
7190 GlobalValue::NotThreadLocal, AddressSpace);
7191 const DataLayout &DL = M.getDataLayout();
7192 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7193 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7194 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7195 Elem.second = GV;
7196 }
7197
7198 return Elem.second;
7199}
7200
7201Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7202 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7203 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7204 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7205}
7206
7207Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7208 LLVMContext &Ctx = M.getContext();
7209 Value *Null =
7210 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7211 Value *SizeGep =
7212 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7213 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7214 return SizePtrToInt;
7215}
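// The null-pointer GEP above is the classic "sizeof via GEP" idiom: a GEP to
// element 1 off a null pointer followed by ptrtoint yields the element size
// as an i64 without consulting the DataLayout at this call site, e.g.
// (sketch)
//
//   %size.gep = getelementptr %T, ptr null, i32 1
//   %size = ptrtoint ptr %size.gep to i64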
7216
7217GlobalVariable *
7218OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7219 std::string VarName) {
7220 llvm::Constant *MaptypesArrayInit =
7221 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7222 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7223 M, MaptypesArrayInit->getType(),
7224 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7225 VarName);
7226 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7227 return MaptypesArrayGlobal;
7228}
7229
7230void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7231 InsertPointTy AllocaIP,
7232 unsigned NumOperands,
7233 struct MapperAllocas &MapperAllocas) {
7234 if (!updateToLocation(Loc))
7235 return;
7236
7237 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7238 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7239 Builder.restoreIP(AllocaIP);
7240 AllocaInst *ArgsBase = Builder.CreateAlloca(
7241 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7242 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7243 ".offload_ptrs");
7244 AllocaInst *ArgSizes = Builder.CreateAlloca(
7245 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7246 Builder.restoreIP(Loc.IP);
7247 MapperAllocas.ArgsBase = ArgsBase;
7248 MapperAllocas.Args = Args;
7249 MapperAllocas.ArgSizes = ArgSizes;
7250}
7251
7252void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7253 Function *MapperFunc, Value *SrcLocInfo,
7254 Value *MaptypesArg, Value *MapnamesArg,
7255 struct MapperAllocas &MapperAllocas,
7256 int64_t DeviceID, unsigned NumOperands) {
7257 if (!updateToLocation(Loc))
7258 return;
7259
7260 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7261 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7262 Value *ArgsBaseGEP =
7263 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7264 {Builder.getInt32(0), Builder.getInt32(0)});
7265 Value *ArgsGEP =
7266 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7267 {Builder.getInt32(0), Builder.getInt32(0)});
7268 Value *ArgSizesGEP =
7269 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7270 {Builder.getInt32(0), Builder.getInt32(0)});
7271 Value *NullPtr =
7272 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7273 Builder.CreateCall(MapperFunc,
7274 {SrcLocInfo, Builder.getInt64(DeviceID),
7275 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7276 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7277}
7278
7279void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7280 TargetDataRTArgs &RTArgs,
7281 TargetDataInfo &Info,
7282 bool EmitDebug,
7283 bool ForEndCall) {
7284 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7285 "expected region end call to runtime only when end call is separate");
7286 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7287 auto VoidPtrTy = UnqualPtrTy;
7288 auto VoidPtrPtrTy = UnqualPtrTy;
7289 auto Int64Ty = Type::getInt64Ty(M.getContext());
7290 auto Int64PtrTy = UnqualPtrTy;
7291
7292 if (!Info.NumberOfPtrs) {
7293 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7294 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7295 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7296 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7297 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7298 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7299 return;
7300 }
7301
7302 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7303 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7304 Info.RTArgs.BasePointersArray,
7305 /*Idx0=*/0, /*Idx1=*/0);
7306 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7307 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7308 /*Idx0=*/0,
7309 /*Idx1=*/0);
7310 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7311 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7312 /*Idx0=*/0, /*Idx1=*/0);
7313 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7314 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7315 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7316 : Info.RTArgs.MapTypesArray,
7317 /*Idx0=*/0,
7318 /*Idx1=*/0);
7319
7320 // Only emit the mapper information arrays if debug information is
7321 // requested.
7322 if (!EmitDebug)
7323 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7324 else
7325 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7326 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7327 /*Idx0=*/0,
7328 /*Idx1=*/0);
7329 // If there is no user-defined mapper, set the mapper array to nullptr to
7330 // avoid an unnecessary data privatization
7331 if (!Info.HasMapper)
7332 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7333 else
7334 RTArgs.MappersArray =
7335 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7336}
7337
7338void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7339 InsertPointTy CodeGenIP,
7340 MapInfosTy &CombinedInfo,
7341 TargetDataInfo &Info) {
7342 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7343 CombinedInfo.NonContigInfo;
7344
7345 // Build an array of struct descriptor_dim and then assign it to
7346 // offload_args.
7347 //
7348 // struct descriptor_dim {
7349 // uint64_t offset;
7350 // uint64_t count;
7351 // uint64_t stride
7352 // };
7353 Type *Int64Ty = Builder.getInt64Ty();
7354 StructType *DimTy = StructType::create(
7355 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7356 "struct.descriptor_dim");
7357
7358 enum { OffsetFD = 0, CountFD, StrideFD };
7359 // We need two index variable here since the size of "Dims" is the same as
7360 // the size of Components, however, the size of offset, count, and stride is
7361 // equal to the size of base declaration that is non-contiguous.
7362 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7363 // Skip emitting ir if dimension size is 1 since it cannot be
7364 // non-contiguous.
7365 if (NonContigInfo.Dims[I] == 1)
7366 continue;
7367 Builder.restoreIP(AllocaIP);
7368 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7369 AllocaInst *DimsAddr =
7370 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7371 Builder.restoreIP(CodeGenIP);
7372 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7373 unsigned RevIdx = EE - II - 1;
7374 Value *DimsLVal = Builder.CreateInBoundsGEP(
7375 DimsAddr->getAllocatedType(), DimsAddr,
7376 {Builder.getInt64(0), Builder.getInt64(II)});
7377 // Offset
7378 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7379 Builder.CreateAlignedStore(
7380 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7381 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7382 // Count
7383 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7384 Builder.CreateAlignedStore(
7385 NonContigInfo.Counts[L][RevIdx], CountLVal,
7386 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7387 // Stride
7388 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7389 Builder.CreateAlignedStore(
7390 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7391 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7392 }
7393 // args[I] = &dims
7394 Builder.restoreIP(CodeGenIP);
7395 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7396 DimsAddr, Builder.getPtrTy());
7397 Value *P = Builder.CreateConstInBoundsGEP2_32(
7398 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7399 Info.RTArgs.PointersArray, 0, I);
7400 Builder.CreateAlignedStore(
7401 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7402 ++L;
7403 }
7404}
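// For illustration: for a two-dimensional non-contiguous section the loop
// above fills a stack array of descriptor_dim triples in reverse of the
// recorded order (note RevIdx), roughly
//
//   %dims = alloca [2 x %struct.descriptor_dim]
//   ; dims[II] = { offset, count, stride } taken from
//   ;            Offsets/Counts/Strides[L][EE - II - 1]
//
// and then redirects the matching .offload_ptrs slot at &%dims.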
7405
7406void OpenMPIRBuilder::emitOffloadingArrays(
7407 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
7408 TargetDataInfo &Info, bool IsNonContiguous,
7409 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7410 function_ref<Value *(unsigned int)> CustomMapperCB) {
7411
7412 // Reset the array information.
7413 Info.clearArrayInfo();
7414 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
7415
7416 if (Info.NumberOfPtrs == 0)
7417 return;
7418
7419 Builder.restoreIP(AllocaIP);
7420 // Detect if we have any capture size requiring runtime evaluation of the
7421 // size so that a constant array could be eventually used.
7422 ArrayType *PointerArrayType =
7423 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
7424
7425 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
7426 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
7427
7428 Info.RTArgs.PointersArray = Builder.CreateAlloca(
7429 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
7430 AllocaInst *MappersArray = Builder.CreateAlloca(
7431 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
7432 Info.RTArgs.MappersArray = MappersArray;
7433
7434 // If we don't have any VLA types or other types that require runtime
7435 // evaluation, we can use a constant array for the map sizes, otherwise we
7436 // need to fill up the arrays as we do for the pointers.
7437 Type *Int64Ty = Builder.getInt64Ty();
7438 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
7439 ConstantInt::get(Int64Ty, 0));
7440 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
7441 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
7442 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
7443 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
7444 if (IsNonContiguous &&
7445 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7446 CombinedInfo.Types[I] &
7447 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
7448 ConstSizes[I] =
7449 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
7450 else
7451 ConstSizes[I] = CI;
7452 continue;
7453 }
7454 }
7455 RuntimeSizes.set(I);
7456 }
7457
7458 if (RuntimeSizes.all()) {
7459 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7460 Info.RTArgs.SizesArray = Builder.CreateAlloca(
7461 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7462 Builder.restoreIP(CodeGenIP);
7463 } else {
7464 auto *SizesArrayInit = ConstantArray::get(
7465 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
7466 std::string Name = createPlatformSpecificName({"offload_sizes"});
7467 auto *SizesArrayGbl =
7468 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
7469 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
7470 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
7471
7472 if (!RuntimeSizes.any()) {
7473 Info.RTArgs.SizesArray = SizesArrayGbl;
7474 } else {
7475 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7476 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
7477 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7479 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7480 Buffer->setAlignment(OffloadSizeAlign);
7481 Builder.restoreIP(CodeGenIP);
7482 Builder.CreateMemCpy(
7483 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
7484 SizesArrayGbl, OffloadSizeAlign,
7485 Builder.getIntN(
7486 IndexSize,
7487 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
7488
7489 Info.RTArgs.SizesArray = Buffer;
7490 }
7491 Builder.restoreIP(CodeGenIP);
7492 }
7493
7494 // The map types are always constant so we don't need to generate code to
7495 // fill arrays. Instead, we create an array constant.
7496 SmallVector<uint64_t, 4> Mapping;
7497 for (auto mapFlag : CombinedInfo.Types)
7498 Mapping.push_back(
7499 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7500 mapFlag));
7501 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
7502 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7503 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
7504
7505 // The information types are only built if provided.
7506 if (!CombinedInfo.Names.empty()) {
7507 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
7508 auto *MapNamesArrayGbl =
7509 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
7510 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
7511 } else {
7512 Info.RTArgs.MapNamesArray =
7513 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
7514 }
7515
7516 // If there's a present map type modifier, it must not be applied to the end
7517 // of a region, so generate a separate map type array in that case.
7518 if (Info.separateBeginEndCalls()) {
7519 bool EndMapTypesDiffer = false;
7520 for (uint64_t &Type : Mapping) {
7521 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7522 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
7523 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7524 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
7525 EndMapTypesDiffer = true;
7526 }
7527 }
7528 if (EndMapTypesDiffer) {
7529 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7530 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
7531 }
7532 }
7533
7534 PointerType *PtrTy = Builder.getPtrTy();
7535 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
7536 Value *BPVal = CombinedInfo.BasePointers[I];
7537 Value *BP = Builder.CreateConstInBoundsGEP2_32(
7538 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
7539 0, I);
7540 Builder.CreateAlignedStore(BPVal, BP,
7541 M.getDataLayout().getPrefTypeAlign(PtrTy));
7542
7543 if (Info.requiresDevicePointerInfo()) {
7544 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
7545 CodeGenIP = Builder.saveIP();
7546 Builder.restoreIP(AllocaIP);
7547 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
7548 Builder.restoreIP(CodeGenIP);
7549 if (DeviceAddrCB)
7550 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
7551 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
7552 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
7553 if (DeviceAddrCB)
7554 DeviceAddrCB(I, BP);
7555 }
7556 }
7557
7558 Value *PVal = CombinedInfo.Pointers[I];
7559 Value *P = Builder.CreateConstInBoundsGEP2_32(
7560 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
7561 I);
7562 // TODO: Check alignment correct.
7563 Builder.CreateAlignedStore(PVal, P,
7564 M.getDataLayout().getPrefTypeAlign(PtrTy));
7565
7566 if (RuntimeSizes.test(I)) {
7567 Value *S = Builder.CreateConstInBoundsGEP2_32(
7568 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7569 /*Idx0=*/0,
7570 /*Idx1=*/I);
7571 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
7572 Int64Ty,
7573 /*isSigned=*/true),
7574 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
7575 }
7576 // Fill up the mapper array.
7577 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7578 Value *MFunc = ConstantPointerNull::get(PtrTy);
7579 if (CustomMapperCB)
7580 if (Value *CustomMFunc = CustomMapperCB(I))
7581 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
7582 Value *MAddr = Builder.CreateInBoundsGEP(
7583 MappersArray->getAllocatedType(), MappersArray,
7584 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
7585 Builder.CreateAlignedStore(
7586 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
7587 }
7588
7589 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
7590 Info.NumberOfPtrs == 0)
7591 return;
7592 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
7593}
7594
7595void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
7596 BasicBlock *CurBB = Builder.GetInsertBlock();
7597
7598 if (!CurBB || CurBB->getTerminator()) {
7599 // If there is no insert point or the previous block is already
7600 // terminated, don't touch it.
7601 } else {
7602 // Otherwise, create a fall-through branch.
7603 Builder.CreateBr(Target);
7604 }
7605
7606 Builder.ClearInsertionPoint();
7607}
7608
7609void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
7610 bool IsFinished) {
7611 BasicBlock *CurBB = Builder.GetInsertBlock();
7612
7613 // Fall out of the current block (if necessary).
7614 emitBranch(BB);
7615
7616 if (IsFinished && BB->use_empty()) {
7617 BB->eraseFromParent();
7618 return;
7619 }
7620
7621 // Place the block after the current block, if possible, or else at
7622 // the end of the function.
7623 if (CurBB && CurBB->getParent())
7624 CurFn->insert(std::next(CurBB->getIterator()), BB);
7625 else
7626 CurFn->insert(CurFn->end(), BB);
7627  Builder.SetInsertPoint(BB);
7628}
7629
7630void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
7631                                   BodyGenCallbackTy ElseGen,
7632 InsertPointTy AllocaIP) {
7633 // If the condition constant folds and can be elided, try to avoid emitting
7634 // the condition and the dead arm of the if/else.
7635 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
7636 auto CondConstant = CI->getSExtValue();
7637 if (CondConstant)
7638 ThenGen(AllocaIP, Builder.saveIP());
7639 else
7640 ElseGen(AllocaIP, Builder.saveIP());
7641 return;
7642 }
7643
7644  Function *CurFn = Builder.GetInsertBlock()->getParent();
7645
7646 // Otherwise, the condition did not fold, or we couldn't elide it. Just
7647 // emit the conditional branch.
7648 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
7649 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
7650 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
7651 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
7652 // Emit the 'then' code.
7653 emitBlock(ThenBlock, CurFn);
7654 ThenGen(AllocaIP, Builder.saveIP());
7655 emitBranch(ContBlock);
7656 // Emit the 'else' code if present.
7657 // There is no need to emit line number for unconditional branch.
7658 emitBlock(ElseBlock, CurFn);
7659 ElseGen(AllocaIP, Builder.saveIP());
7660 // There is no need to emit line number for unconditional branch.
7661 emitBranch(ContBlock);
7662 // Emit the continuation block for code after the if.
7663 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
7664}
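// Illustrative use of emitIfClause (a sketch; OMPBuilder, CondVal and
// AllocaIP are assumed to exist in the caller):
// ```
//   auto ThenGen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) {
//     /* emit the variant used when the 'if' clause evaluates to true */
//   };
//   auto ElseGen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) {
//     /* emit the sequential fallback */
//   };
//   OMPBuilder.emitIfClause(CondVal, ThenGen, ElseGen, AllocaIP);
// ```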
7665
7666bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
7667 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
7670 "Unexpected Atomic Ordering.");
7671
7672 bool Flush = false;
7673  AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
7674
7675 switch (AK) {
7676  case Read:
7677    if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
7678        AO == AtomicOrdering::SequentiallyConsistent) {
7679      FlushAO = AtomicOrdering::Acquire;
7680 Flush = true;
7681 }
7682 break;
7683 case Write:
7684 case Compare:
7685  case Update:
7686    if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
7687        AO == AtomicOrdering::SequentiallyConsistent) {
7688      FlushAO = AtomicOrdering::Release;
7689 Flush = true;
7690 }
7691 break;
7692 case Capture:
7693    switch (AO) {
7694    case AtomicOrdering::Acquire:
7695      FlushAO = AtomicOrdering::Acquire;
7696      Flush = true;
7697      break;
7698    case AtomicOrdering::Release:
7699      FlushAO = AtomicOrdering::Release;
7700      Flush = true;
7701      break;
7702    case AtomicOrdering::AcquireRelease:
7703    case AtomicOrdering::SequentiallyConsistent:
7704      FlushAO = AtomicOrdering::AcquireRelease;
7705      Flush = true;
7706 break;
7707 default:
7708 // do nothing - leave silently.
7709 break;
7710 }
7711 }
7712
7713 if (Flush) {
7714    // The Flush RT call does not yet take a memory ordering, so there is
7715    // nothing to pass it; we still resolve which atomic ordering the flush
7716    // would need, but issue the plain flush call for now.
7717 // TODO: pass `FlushAO` after memory ordering support is added
7718 (void)FlushAO;
7719 emitFlush(Loc);
7720 }
7721
7722  // For AO == AtomicOrdering::Monotonic and all other combinations of AO and
7723  // AK, do nothing.
7724 return Flush;
7725}
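// For example, an atomic read with seq_cst ordering resolves FlushAO to
// Acquire and emits a flush, while a relaxed (monotonic) atomic emits none.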
7726
7727OpenMPIRBuilder::InsertPointTy
7728OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
7729                                  AtomicOpValue &X, AtomicOpValue &V,
7730                                  AtomicOrdering AO) {
7731 if (!updateToLocation(Loc))
7732 return Loc.IP;
7733
7734 assert(X.Var->getType()->isPointerTy() &&
7735 "OMP Atomic expects a pointer to target memory");
7736 Type *XElemTy = X.ElemTy;
7737 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7738 XElemTy->isPointerTy()) &&
7739 "OMP atomic read expected a scalar type");
7740
7741 Value *XRead = nullptr;
7742
7743 if (XElemTy->isIntegerTy()) {
7744 LoadInst *XLD =
7745 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
7746 XLD->setAtomic(AO);
7747 XRead = cast<Value>(XLD);
7748 } else {
7749 // We need to perform atomic op as integer
7750 IntegerType *IntCastTy =
7751        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7752    LoadInst *XLoad =
7753 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
7754 XLoad->setAtomic(AO);
7755 if (XElemTy->isFloatingPointTy()) {
7756 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
7757 } else {
7758 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
7759 }
7760 }
7761 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
7762 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
7763 return Builder.saveIP();
7764}
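// For an integer X the emitted sequence is roughly:
// ```
//   %omp.atomic.read = load atomic i32, ptr %x seq_cst, align 4
//   store i32 %omp.atomic.read, ptr %v, align 4
// ```
// (with the requested ordering; float and pointer element types are loaded
// as an integer of equal width and cast back).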
7765
7766OpenMPIRBuilder::InsertPointTy
7767OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
7768                                   AtomicOpValue &X, Value *Expr,
7769 AtomicOrdering AO) {
7770 if (!updateToLocation(Loc))
7771 return Loc.IP;
7772
7773 assert(X.Var->getType()->isPointerTy() &&
7774 "OMP Atomic expects a pointer to target memory");
7775 Type *XElemTy = X.ElemTy;
7776 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7777 XElemTy->isPointerTy()) &&
7778 "OMP atomic write expected a scalar type");
7779
7780 if (XElemTy->isIntegerTy()) {
7781 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
7782 XSt->setAtomic(AO);
7783 } else {
7784 // We need to bitcast and perform atomic op as integers
7785 IntegerType *IntCastTy =
7786        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7787    Value *ExprCast =
7788 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
7789 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
7790 XSt->setAtomic(AO);
7791 }
7792
7793 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
7794 return Builder.saveIP();
7795}
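// E.g. an atomic write of a float with release ordering bitcasts the value
// to i32 and emits `store atomic i32 %cast, ptr %x release`, since the store
// must be performed on an integer of the same width.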
7796
7797OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
7798    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7799 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7800 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
7801 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
7802 if (!updateToLocation(Loc))
7803 return Loc.IP;
7804
7805 LLVM_DEBUG({
7806 Type *XTy = X.Var->getType();
7807 assert(XTy->isPointerTy() &&
7808 "OMP Atomic expects a pointer to target memory");
7809 Type *XElemTy = X.ElemTy;
7810 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7811 XElemTy->isPointerTy()) &&
7812 "OMP atomic update expected a scalar type");
7813 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7814 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
7815 "OpenMP atomic does not support LT or GT operations");
7816 });
7817
7818 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
7819 X.IsVolatile, IsXBinopExpr);
7820 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
7821 return Builder.saveIP();
7822}
7823
7824// FIXME: Duplicating AtomicExpand
7825Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
7826 AtomicRMWInst::BinOp RMWOp) {
7827 switch (RMWOp) {
7828 case AtomicRMWInst::Add:
7829 return Builder.CreateAdd(Src1, Src2);
7830 case AtomicRMWInst::Sub:
7831 return Builder.CreateSub(Src1, Src2);
7832 case AtomicRMWInst::And:
7833 return Builder.CreateAnd(Src1, Src2);
7834  case AtomicRMWInst::Nand:
7835    return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
7836 case AtomicRMWInst::Or:
7837 return Builder.CreateOr(Src1, Src2);
7838 case AtomicRMWInst::Xor:
7839 return Builder.CreateXor(Src1, Src2);
7840  case AtomicRMWInst::Xchg:
7841  case AtomicRMWInst::FAdd:
7842  case AtomicRMWInst::FSub:
7843  case AtomicRMWInst::BAD_BINOP:
7844  case AtomicRMWInst::Max:
7845  case AtomicRMWInst::Min:
7846  case AtomicRMWInst::UMax:
7847  case AtomicRMWInst::UMin:
7848  case AtomicRMWInst::FMax:
7849  case AtomicRMWInst::FMin:
7850  case AtomicRMWInst::UIncWrap:
7851  case AtomicRMWInst::UDecWrap:
7852    llvm_unreachable("Unsupported atomic update operation");
7853 }
7854 llvm_unreachable("Unsupported atomic update operation");
7855}
7856
7857std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
7858 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
7859    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7860    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
7861 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
7862 // or a complex datatype.
7863 bool emitRMWOp = false;
7864 switch (RMWOp) {
7865 case AtomicRMWInst::Add:
7866 case AtomicRMWInst::And:
7867  case AtomicRMWInst::Nand:
7868  case AtomicRMWInst::Or:
7869  case AtomicRMWInst::Xor:
7870  case AtomicRMWInst::Xchg:
7871    emitRMWOp = XElemTy;
7872 break;
7873 case AtomicRMWInst::Sub:
7874 emitRMWOp = (IsXBinopExpr && XElemTy);
7875 break;
7876 default:
7877 emitRMWOp = false;
7878 }
7879 emitRMWOp &= XElemTy->isIntegerTy();
7880
7881 std::pair<Value *, Value *> Res;
7882 if (emitRMWOp) {
7883 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
7884 // not needed except in case of postfix captures. Generate anyway for
7885 // consistency with the else part. Will be removed with any DCE pass.
7886    // AtomicRMWInst::Xchg does not have a corresponding instruction.
7887 if (RMWOp == AtomicRMWInst::Xchg)
7888 Res.second = Res.first;
7889 else
7890 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
7891 } else {
7892 IntegerType *IntCastTy =
7893        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7894    LoadInst *OldVal =
7895 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
7896 OldVal->setAtomic(AO);
7897 // CurBB
7898 // | /---\
7899 // ContBB |
7900 // | \---/
7901 // ExitBB
7902    BasicBlock *CurBB = Builder.GetInsertBlock();
7903    Instruction *CurBBTI = CurBB->getTerminator();
7904 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
7905 BasicBlock *ExitBB =
7906 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
7907 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
7908 X->getName() + ".atomic.cont");
7909 ContBB->getTerminator()->eraseFromParent();
7910 Builder.restoreIP(AllocaIP);
7911 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
7912 NewAtomicAddr->setName(X->getName() + "x.new.val");
7913 Builder.SetInsertPoint(ContBB);
7914 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
7915 PHI->addIncoming(OldVal, CurBB);
7916 bool IsIntTy = XElemTy->isIntegerTy();
7917 Value *OldExprVal = PHI;
7918 if (!IsIntTy) {
7919 if (XElemTy->isFloatingPointTy()) {
7920 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
7921 X->getName() + ".atomic.fltCast");
7922 } else {
7923 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
7924 X->getName() + ".atomic.ptrCast");
7925 }
7926 }
7927
7928 Value *Upd = UpdateOp(OldExprVal, Builder);
7929 Builder.CreateStore(Upd, NewAtomicAddr);
7930 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
7931    AtomicOrdering Failure =
7932        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
7933    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
7934        X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
7935 Result->setVolatile(VolatileX);
7936 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
7937 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7938 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
7939 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
7940
7941 Res.first = OldExprVal;
7942 Res.second = Upd;
7943
7944 // set Insertion point in exit block
7945 if (UnreachableInst *ExitTI =
7946 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
7947 CurBBTI->eraseFromParent();
7948 Builder.SetInsertPoint(ExitBB);
7949 } else {
7950 Builder.SetInsertPoint(ExitTI);
7951 }
7952 }
7953
7954 return Res;
7955}
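// Sketch of a caller-provided UpdateOp for `x = x + expr` when no atomicrmw
// form applies (Expr is assumed to be visible to the lambda):
// ```
//   auto UpdateOp = [&](Value *XOld, IRBuilderBase &IRB) -> Value * {
//     return IRB.CreateFAdd(XOld, Expr);
//   };
// ```
// emitAtomicUpdate wraps this computation in the load/cmpxchg retry loop
// built above.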
7956
7957OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
7958    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7959    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
7960    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
7961    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
7962 if (!updateToLocation(Loc))
7963 return Loc.IP;
7964
7965 LLVM_DEBUG({
7966 Type *XTy = X.Var->getType();
7967 assert(XTy->isPointerTy() &&
7968 "OMP Atomic expects a pointer to target memory");
7969 Type *XElemTy = X.ElemTy;
7970 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7971 XElemTy->isPointerTy()) &&
7972 "OMP atomic capture expected a scalar type");
7973 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7974 "OpenMP atomic does not support LT or GT operations");
7975 });
7976
7977 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
7978 // 'x' is simply atomically rewritten with 'expr'.
7979 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
7980 std::pair<Value *, Value *> Result =
7981 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
7982 X.IsVolatile, IsXBinopExpr);
7983
7984 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
7985 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
7986
7987 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
7988 return Builder.saveIP();
7989}
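// E.g. for a postfix capture `v = x++;` the value stored to 'v' is
// Result.first (the old value of 'x'); for the prefix form `v = ++x;` it is
// Result.second (the updated value).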
7990
7991OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
7992    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
7993    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
7994    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
7995 bool IsFailOnly) {
7996
7997  AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
7998 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
7999 IsPostfixUpdate, IsFailOnly, Failure);
8000}
8001
8002OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8003    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8004    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8005    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8006 bool IsFailOnly, AtomicOrdering Failure) {
8007
8008 if (!updateToLocation(Loc))
8009 return Loc.IP;
8010
8011 assert(X.Var->getType()->isPointerTy() &&
8012 "OMP atomic expects a pointer to target memory");
8013 // compare capture
8014 if (V.Var) {
8015 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8016 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8017 }
8018
8019 bool IsInteger = E->getType()->isIntegerTy();
8020
8021 if (Op == OMPAtomicCompareOp::EQ) {
8022 AtomicCmpXchgInst *Result = nullptr;
8023 if (!IsInteger) {
8024 IntegerType *IntCastTy =
8025 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8026 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8027 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8028 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8029 AO, Failure);
8030 } else {
8031 Result =
8032 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8033 }
8034
8035 if (V.Var) {
8036 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8037 if (!IsInteger)
8038 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8039 assert(OldValue->getType() == V.ElemTy &&
8040 "OldValue and V must be of same type");
8041 if (IsPostfixUpdate) {
8042 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8043 } else {
8044 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8045 if (IsFailOnly) {
8046 // CurBB----
8047 // | |
8048 // v |
8049 // ContBB |
8050 // | |
8051 // v |
8052 // ExitBB <-
8053 //
8054 // where ContBB only contains the store of old value to 'v'.
8055        BasicBlock *CurBB = Builder.GetInsertBlock();
8056        Instruction *CurBBTI = CurBB->getTerminator();
8057 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8058 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8059 CurBBTI, X.Var->getName() + ".atomic.exit");
8060 BasicBlock *ContBB = CurBB->splitBasicBlock(
8061 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8062 ContBB->getTerminator()->eraseFromParent();
8063 CurBB->getTerminator()->eraseFromParent();
8064
8065 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8066
8067 Builder.SetInsertPoint(ContBB);
8068 Builder.CreateStore(OldValue, V.Var);
8069 Builder.CreateBr(ExitBB);
8070
8071 if (UnreachableInst *ExitTI =
8072 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8073 CurBBTI->eraseFromParent();
8074 Builder.SetInsertPoint(ExitBB);
8075 } else {
8076 Builder.SetInsertPoint(ExitTI);
8077 }
8078 } else {
8079 Value *CapturedValue =
8080 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8081 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8082 }
8083 }
8084 }
8085 // The comparison result has to be stored.
8086 if (R.Var) {
8087 assert(R.Var->getType()->isPointerTy() &&
8088 "r.var must be of pointer type");
8089 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8090
8091 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8092 Value *ResultCast = R.IsSigned
8093 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8094 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8095 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8096 }
8097 } else {
8098 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8099 "Op should be either max or min at this point");
8100 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8101
8102 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8103 // Let's take max as example.
8104 // OpenMP form:
8105 // x = x > expr ? expr : x;
8106 // LLVM form:
8107 // *ptr = *ptr > val ? *ptr : val;
8108 // We need to transform to LLVM form.
8109 // x = x <= expr ? x : expr;
8110    AtomicRMWInst::BinOp NewOp;
8111    if (IsXBinopExpr) {
8112 if (IsInteger) {
8113 if (X.IsSigned)
8114        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8115                                              : AtomicRMWInst::Max;
8116      else
8117        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8118                                              : AtomicRMWInst::UMax;
8119    } else {
8120      NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8121                                            : AtomicRMWInst::FMax;
8122    }
8123 } else {
8124 if (IsInteger) {
8125 if (X.IsSigned)
8126        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8127                                              : AtomicRMWInst::Min;
8128      else
8129        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8130                                              : AtomicRMWInst::UMin;
8131    } else {
8132      NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8133                                            : AtomicRMWInst::FMin;
8134    }
8135 }
8136
8137 AtomicRMWInst *OldValue =
8138 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8139 if (V.Var) {
8140 Value *CapturedValue = nullptr;
8141 if (IsPostfixUpdate) {
8142 CapturedValue = OldValue;
8143 } else {
8144 CmpInst::Predicate Pred;
8145 switch (NewOp) {
8146 case AtomicRMWInst::Max:
8147 Pred = CmpInst::ICMP_SGT;
8148 break;
8149        case AtomicRMWInst::UMax:
8150          Pred = CmpInst::ICMP_UGT;
8151          break;
8152        case AtomicRMWInst::FMax:
8153          Pred = CmpInst::FCMP_OGT;
8154          break;
8155        case AtomicRMWInst::Min:
8156          Pred = CmpInst::ICMP_SLT;
8157          break;
8158        case AtomicRMWInst::UMin:
8159          Pred = CmpInst::ICMP_ULT;
8160          break;
8161        case AtomicRMWInst::FMin:
8162          Pred = CmpInst::FCMP_OLT;
8163 break;
8164 default:
8165 llvm_unreachable("unexpected comparison op");
8166 }
8167 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8168 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8169 }
8170 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8171 }
8172 }
8173
8174 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8175
8176 return Builder.saveIP();
8177}
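// Example of the ordop reversal above: the OpenMP MAX form
// `x = x > e ? e : x` with a signed integer 'x' (IsXBinopExpr set) clips 'x'
// from above and therefore lowers to `atomicrmw min ptr %x, i32 %e`.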
8178
8179OpenMPIRBuilder::InsertPointTy
8180OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8181                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8182 Value *NumTeamsUpper, Value *ThreadLimit,
8183 Value *IfExpr) {
8184 if (!updateToLocation(Loc))
8185 return InsertPointTy();
8186
8187 uint32_t SrcLocStrSize;
8188 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8189 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8190 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8191
8192 // Outer allocation basicblock is the entry block of the current function.
8193 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8194 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8195 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8196 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8197 }
8198
8199 // The current basic block is split into four basic blocks. After outlining,
8200 // they will be mapped as follows:
8201 // ```
8202 // def current_fn() {
8203 // current_basic_block:
8204 // br label %teams.exit
8205 // teams.exit:
8206 // ; instructions after teams
8207 // }
8208 //
8209 // def outlined_fn() {
8210 // teams.alloca:
8211 // br label %teams.body
8212 // teams.body:
8213 // ; instructions within teams body
8214 // }
8215 // ```
8216 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8217 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8218 BasicBlock *AllocaBB =
8219 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8220
8221 bool SubClausesPresent =
8222 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8223 // Push num_teams
8224 if (!Config.isTargetDevice() && SubClausesPresent) {
8225 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8226 "if lowerbound is non-null, then upperbound must also be non-null "
8227 "for bounds on num_teams");
8228
8229 if (NumTeamsUpper == nullptr)
8230 NumTeamsUpper = Builder.getInt32(0);
8231
8232 if (NumTeamsLower == nullptr)
8233 NumTeamsLower = NumTeamsUpper;
8234
8235 if (IfExpr) {
8236 assert(IfExpr->getType()->isIntegerTy() &&
8237 "argument to if clause must be an integer value");
8238
8239 // upper = ifexpr ? upper : 1
8240 if (IfExpr->getType() != Int1)
8241 IfExpr = Builder.CreateICmpNE(IfExpr,
8242 ConstantInt::get(IfExpr->getType(), 0));
8243 NumTeamsUpper = Builder.CreateSelect(
8244 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8245
8246 // lower = ifexpr ? lower : 1
8247 NumTeamsLower = Builder.CreateSelect(
8248 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8249 }
8250
8251 if (ThreadLimit == nullptr)
8252 ThreadLimit = Builder.getInt32(0);
8253
8254 Value *ThreadNum = getOrCreateThreadID(Ident);
8255    Builder.CreateCall(
8256        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8257 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8258 }
8259 // Generate the body of teams.
8260 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8261 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8262 BodyGenCB(AllocaIP, CodeGenIP);
8263
8264 OutlineInfo OI;
8265 OI.EntryBB = AllocaBB;
8266 OI.ExitBB = ExitBB;
8267 OI.OuterAllocaBB = &OuterAllocaBB;
8268
8269 // Insert fake values for global tid and bound tid.
8270  SmallVector<Instruction *, 4> ToBeDeleted;
8271  InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8272  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8273      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8274  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8275      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8276
8277 auto HostPostOutlineCB = [this, Ident,
8278 ToBeDeleted](Function &OutlinedFn) mutable {
8279 // The stale call instruction will be replaced with a new call instruction
8280 // for runtime call with the outlined function.
8281
8282 assert(OutlinedFn.getNumUses() == 1 &&
8283 "there must be a single user for the outlined function");
8284 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8285 ToBeDeleted.push_back(StaleCI);
8286
8287 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8288 "Outlined function must have two or three arguments only");
8289
8290 bool HasShared = OutlinedFn.arg_size() == 3;
8291
8292 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8293 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8294 if (HasShared)
8295 OutlinedFn.getArg(2)->setName("data");
8296
8297 // Call to the runtime function for teams in the current function.
8298 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8299 "outlined function.");
8300 Builder.SetInsertPoint(StaleCI);
8301 SmallVector<Value *> Args = {
8302 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8303 if (HasShared)
8304 Args.push_back(StaleCI->getArgOperand(2));
8305    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
8306                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8307 Args);
8308
8309 llvm::for_each(llvm::reverse(ToBeDeleted),
8310 [](Instruction *I) { I->eraseFromParent(); });
8311
8312 };
8313
8314 if (!Config.isTargetDevice())
8315 OI.PostOutlineCB = HostPostOutlineCB;
8316
8317 addOutlineInfo(std::move(OI));
8318
8319 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
8320
8321 return Builder.saveIP();
8322}
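// Illustrative call site for lowering `#pragma omp teams num_teams(L:U)
// thread_limit(T)` on the host (names are assumed to exist in the caller):
// ```
//   Builder.restoreIP(OMPBuilder.createTeams(
//       Loc, BodyGenCB, /*NumTeamsLower=*/L, /*NumTeamsUpper=*/U,
//       /*ThreadLimit=*/T, /*IfExpr=*/nullptr));
// ```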
8323
8324Constant *
8325OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
8326                                       std::string VarName) {
8327 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
8328      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
8329                           Names.size()),
8330 Names);
8331 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
8332 M, MapNamesArrayInit->getType(),
8333 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
8334 VarName);
8335 return MapNamesArrayGlobal;
8336}
8337
8338// Create all simple and struct types exposed by the runtime and remember
8339// the llvm::PointerTypes of them for easy access later.
8340void OpenMPIRBuilder::initializeTypes(Module &M) {
8341 LLVMContext &Ctx = M.getContext();
8342 StructType *T;
8343#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
8344#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
8345 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
8346 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
8347#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
8348 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
8349 VarName##Ptr = PointerType::getUnqual(VarName);
8350#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
8351 T = StructType::getTypeByName(Ctx, StructName); \
8352 if (!T) \
8353 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
8354 VarName = T; \
8355 VarName##Ptr = PointerType::getUnqual(T);
8356#include "llvm/Frontend/OpenMP/OMPKinds.def"
8357}
8358
8359void OpenMPIRBuilder::OutlineInfo::collectBlocks(
8360    SmallPtrSetImpl<BasicBlock *> &BlockSet,
8361    SmallVectorImpl<BasicBlock *> &BlockVector) {
8362  SmallVector<BasicBlock *, 32> Worklist;
8363  BlockSet.insert(EntryBB);
8364 BlockSet.insert(ExitBB);
8365
8366 Worklist.push_back(EntryBB);
8367 while (!Worklist.empty()) {
8368 BasicBlock *BB = Worklist.pop_back_val();
8369 BlockVector.push_back(BB);
8370 for (BasicBlock *SuccBB : successors(BB))
8371 if (BlockSet.insert(SuccBB).second)
8372 Worklist.push_back(SuccBB);
8373 }
8374}
8375
8376void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
8377                                         uint64_t Size, int32_t Flags,
8378                                         GlobalValue::LinkageTypes,
8379                                         StringRef Name) {
8380 if (!Config.isGPU()) {
8381    llvm::offloading::emitOffloadingEntry(
8382        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
8383 "omp_offloading_entries");
8384 return;
8385 }
8386 // TODO: Add support for global variables on the device after declare target
8387 // support.
8388 Function *Fn = dyn_cast<Function>(Addr);
8389 if (!Fn)
8390 return;
8391
8392 Module &M = *(Fn->getParent());
8393 LLVMContext &Ctx = M.getContext();
8394
8395 // Get "nvvm.annotations" metadata node.
8396 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
8397
8398 Metadata *MDVals[] = {
8399 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
8400 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
8401 // Append metadata to nvvm.annotations.
8402 MD->addOperand(MDNode::get(Ctx, MDVals));
8403
8404 // Add a function attribute for the kernel.
8405 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
8406 if (T.isAMDGCN())
8407 Fn->addFnAttr("uniform-work-group-size", "true");
8408 Fn->addFnAttr(Attribute::MustProgress);
8409}
8410
8411// We only generate metadata for functions that contain target regions.
8412void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
8413    EmitMetadataErrorReportFunctionTy &ErrorFn) {
8414
8415 // If there are no entries, we don't need to do anything.
8416  if (OffloadInfoManager.empty())
8417    return;
8418
8419  LLVMContext &C = M.getContext();
8420  SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
8421                        TargetRegionEntryInfo>,
8422              16>
8423 OrderedEntries(OffloadInfoManager.size());
8424
8425 // Auxiliary methods to create metadata values and strings.
8426 auto &&GetMDInt = [this](unsigned V) {
8427 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
8428 };
8429
8430 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
8431
8432 // Create the offloading info metadata node.
8433 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
8434 auto &&TargetRegionMetadataEmitter =
8435 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
8436 const TargetRegionEntryInfo &EntryInfo,
8437        const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
8438      // Generate metadata for target regions. Each entry of this metadata
8439 // contains:
8440 // - Entry 0 -> Kind of this type of metadata (0).
8441 // - Entry 1 -> Device ID of the file where the entry was identified.
8442 // - Entry 2 -> File ID of the file where the entry was identified.
8443 // - Entry 3 -> Mangled name of the function where the entry was
8444 // identified.
8445 // - Entry 4 -> Line in the file where the entry was identified.
8446 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
8447 // - Entry 6 -> Order the entry was created.
8448 // The first element of the metadata node is the kind.
8449 Metadata *Ops[] = {
8450 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
8451 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
8452 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
8453 GetMDInt(E.getOrder())};
8454
8455 // Save this entry in the right position of the ordered entries array.
8456 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
8457
8458 // Add metadata to the named metadata node.
8459 MD->addOperand(MDNode::get(C, Ops));
8460 };
8461
8462 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
8463
8464 // Create function that emits metadata for each device global variable entry;
8465 auto &&DeviceGlobalVarMetadataEmitter =
8466 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
8467 StringRef MangledName,
8468        const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
8469      // Generate metadata for global variables. Each entry of this metadata
8470 // contains:
8471 // - Entry 0 -> Kind of this type of metadata (1).
8472 // - Entry 1 -> Mangled name of the variable.
8473 // - Entry 2 -> Declare target kind.
8474 // - Entry 3 -> Order the entry was created.
8475 // The first element of the metadata node is the kind.
8476 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
8477 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
8478
8479 // Save this entry in the right position of the ordered entries array.
8480 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
8481 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
8482
8483 // Add metadata to the named metadata node.
8484 MD->addOperand(MDNode::get(C, Ops));
8485 };
8486
8487  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
8488      DeviceGlobalVarMetadataEmitter);
8489
8490 for (const auto &E : OrderedEntries) {
8491 assert(E.first && "All ordered entries must exist!");
8492 if (const auto *CE =
8493 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
8494 E.first)) {
8495 if (!CE->getID() || !CE->getAddress()) {
8496      // Do not blame the entry if the parent function is not emitted.
8497 TargetRegionEntryInfo EntryInfo = E.second;
8498 StringRef FnName = EntryInfo.ParentName;
8499 if (!M.getNamedValue(FnName))
8500 continue;
8501 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
8502 continue;
8503 }
8504 createOffloadEntry(CE->getID(), CE->getAddress(),
8505                         /*Size=*/0, CE->getFlags(),
8506                         GlobalValue::WeakAnyLinkage);
8507 } else if (const auto *CE = dyn_cast<
8508                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
8509                   E.first)) {
8510      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
8511          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8512              CE->getFlags());
8513 switch (Flags) {
8514    case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
8515    case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
8516      if (Config.hasRequiresUnifiedSharedMemory())
8517        continue;
8518 if (!CE->getAddress()) {
8519 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
8520 continue;
8521 }
8522      // The variable has no definition - no need to add the entry.
8523 if (CE->getVarSize() == 0)
8524 continue;
8525 break;
8526    case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
8527      assert(((Config.isTargetDevice() && !CE->getAddress()) ||
8528 (!Config.isTargetDevice() && CE->getAddress())) &&
8529 "Declaret target link address is set.");
8530 if (Config.isTargetDevice())
8531 continue;
8532 if (!CE->getAddress()) {
8533        ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, E.second);
8534        continue;
8535 }
8536 break;
8537 default:
8538 break;
8539 }
8540
8541 // Hidden or internal symbols on the device are not externally visible.
8542 // We should not attempt to register them by creating an offloading
8543 // entry. Indirect variables are handled separately on the device.
8544 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
8545 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
8546            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
8547          continue;
8548
8549 // Indirect globals need to use a special name that doesn't match the name
8550 // of the associated host global.
8551      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
8552        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8553 Flags, CE->getLinkage(), CE->getVarName());
8554 else
8555 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8556 Flags, CE->getLinkage());
8557
8558 } else {
8559 llvm_unreachable("Unsupported entry kind.");
8560 }
8561 }
8562
8563 // Emit requires directive globals to a special entry so the runtime can
8564 // register them when the device image is loaded.
8565 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
8566 // entries should be redesigned to better suit this use-case.
8567  if (Config.hasRequiresFlags() && !Config.isTargetDevice())
8568    offloading::emitOffloadingEntry(
8569        M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
8570        /*Name=*/"",
8571        /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
8572        Config.getRequiresFlags(), "omp_offloading_entries");
8573}
8574
8575void TargetRegionEntryInfo::getTargetRegionEntryFnName(
8576    SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
8577 unsigned FileID, unsigned Line, unsigned Count) {
8579 OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
8580 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
8581 if (Count)
8582 OS << "_" << Count;
8583}
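// For example, DeviceID=0x2f, FileID=0x1a, ParentName="foo", Line=42 and
// Count=0 produce "__omp_offloading_2f_1a_foo_l42"; a non-zero count appends
// a trailing "_<count>".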
8584
8585void OpenMPIRBuilder::getTargetRegionEntryFnName(
8586    SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
8587  unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
8588  TargetRegionEntryInfo::getTargetRegionEntryFnName(
8589      Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
8590 EntryInfo.Line, NewCount);
8591}
8592
8593TargetRegionEntryInfo
8594OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
8595                                          StringRef ParentName) {
8596  sys::fs::UniqueID ID;
8597  auto FileIDInfo = CallBack();
8598 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
8599 report_fatal_error(("Unable to get unique ID for file, during "
8600 "getTargetEntryUniqueInfo, error message: " +
8601 EC.message())
8602 .c_str());
8603 }
8604
8605 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
8606 std::get<1>(FileIDInfo));
8607}
8608
8609unsigned OpenMPIRBuilder::getFlagMemberOffset() {
8610  unsigned Offset = 0;
8611 for (uint64_t Remain =
8612 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8613             omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
8614         !(Remain & 1); Remain = Remain >> 1)
8615 Offset++;
8616 return Offset;
8617}
8618
8619omp::OpenMPOffloadMappingFlags
8620OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
8621  // Rotate by getFlagMemberOffset() bits.
8622 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
8623 << getFlagMemberOffset());
8624}
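// E.g. getMemberOfFlag(0) yields the value 1 placed into the MEMBER_OF bit
// field (i.e. 1ULL << getFlagMemberOffset()); positions are biased by one so
// that a zero field can still mean "no MEMBER_OF".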
8625
8626void OpenMPIRBuilder::setCorrectMemberOfFlag(
8627    omp::OpenMPOffloadMappingFlags &Flags,
8628    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
8629 // If the entry is PTR_AND_OBJ but has not been marked with the special
8630 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
8631 // marked as MEMBER_OF.
8632 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8633          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
8634      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8635          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
8636          static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
8637    return;
8638
8639 // Reset the placeholder value to prepare the flag for the assignment of the
8640 // proper MEMBER_OF value.
8641 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
8642 Flags |= MemberOfFlag;
8643}
8644
8645Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
8646    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8647    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8648    bool IsDeclaration, bool IsExternallyVisible,
8649 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8650 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8651 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
8652 std::function<Constant *()> GlobalInitializer,
8653 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
8654 // TODO: convert this to utilise the IRBuilder Config rather than
8655 // a passed down argument.
8656 if (OpenMPSIMD)
8657 return nullptr;
8658
8659  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
8660      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8661        CaptureClause ==
8662            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8663       Config.hasRequiresUnifiedSharedMemory())) {
8664 SmallString<64> PtrName;
8665 {
8666 raw_svector_ostream OS(PtrName);
8667 OS << MangledName;
8668 if (!IsExternallyVisible)
8669 OS << format("_%x", EntryInfo.FileID);
8670 OS << "_decl_tgt_ref_ptr";
8671 }
8672
8673 Value *Ptr = M.getNamedValue(PtrName);
8674
8675 if (!Ptr) {
8676 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
8677 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
8678
8679 auto *GV = cast<GlobalVariable>(Ptr);
8680 GV->setLinkage(GlobalValue::WeakAnyLinkage);
8681
8682 if (!Config.isTargetDevice()) {
8683 if (GlobalInitializer)
8684 GV->setInitializer(GlobalInitializer());
8685 else
8686 GV->setInitializer(GlobalValue);
8687 }
8688
8689      registerTargetGlobalVariable(
8690          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8691 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8692 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
8693 }
8694
8695 return cast<Constant>(Ptr);
8696 }
8697
8698 return nullptr;
8699}
8700
8701void OpenMPIRBuilder::registerTargetGlobalVariable(
8702    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8703    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8704    bool IsDeclaration, bool IsExternallyVisible,
8705 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8706 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8707 std::vector<Triple> TargetTriple,
8708 std::function<Constant *()> GlobalInitializer,
8709 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
8710 Constant *Addr) {
8711  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
8712      (TargetTriple.empty() && !Config.isTargetDevice()))
8713 return;
8714
8715  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
8716  StringRef VarName;
8717 int64_t VarSize;
8718  GlobalValue::LinkageTypes Linkage;
8719
8720  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8721       CaptureClause ==
8722           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8723      !Config.hasRequiresUnifiedSharedMemory()) {
8724    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8725 VarName = MangledName;
8726 GlobalValue *LlvmVal = M.getNamedValue(VarName);
8727
8728 if (!IsDeclaration)
8729 VarSize = divideCeil(
8730          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
8731    else
8732 VarSize = 0;
8733 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
8734
8735 // This is a workaround carried over from Clang which prevents undesired
8736 // optimisation of internal variables.
8737 if (Config.isTargetDevice() &&
8738 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
8739 // Do not create a "ref-variable" if the original is not also available
8740 // on the host.
8741      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
8742        return;
8743
8744 std::string RefName = createPlatformSpecificName({VarName, "ref"});
8745
8746 if (!M.getNamedValue(RefName)) {
8747 Constant *AddrRef =
8748 getOrCreateInternalVariable(Addr->getType(), RefName);
8749 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
8750 GvAddrRef->setConstant(true);
8751 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
8752 GvAddrRef->setInitializer(Addr);
8753 GeneratedRefs.push_back(GvAddrRef);
8754 }
8755 }
8756 } else {
8757    if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
8758      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
8759    else
8760      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8761
8762 if (Config.isTargetDevice()) {
8763 VarName = (Addr) ? Addr->getName() : "";
8764 Addr = nullptr;
8765 } else {
8766      Addr = getAddrOfDeclareTargetVar(
8767          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8768 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8769 LlvmPtrTy, GlobalInitializer, VariableLinkage);
8770 VarName = (Addr) ? Addr->getName() : "";
8771 }
8772 VarSize = M.getDataLayout().getPointerSize();
8773    Linkage = GlobalValue::WeakAnyLinkage;
8774  }
8775
8776  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
8777                                                      Flags, Linkage);
8778}
8779
8780/// Loads all the offload entries information from the host IR
8781/// metadata.
8782void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
8783  // If we are in target mode, load the metadata from the host IR. This code has
8784 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
8785
8786  NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
8787  if (!MD)
8788 return;
8789
8790 for (MDNode *MN : MD->operands()) {
8791 auto &&GetMDInt = [MN](unsigned Idx) {
8792 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
8793 return cast<ConstantInt>(V->getValue())->getZExtValue();
8794 };
8795
8796 auto &&GetMDString = [MN](unsigned Idx) {
8797 auto *V = cast<MDString>(MN->getOperand(Idx));
8798 return V->getString();
8799 };
8800
8801 switch (GetMDInt(0)) {
8802 default:
8803 llvm_unreachable("Unexpected metadata!");
8804 break;
8805    case OffloadEntriesInfoManager::OffloadEntryInfo::
8806        OffloadingEntryInfoTargetRegion: {
8807      TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
8808 /*DeviceID=*/GetMDInt(1),
8809 /*FileID=*/GetMDInt(2),
8810 /*Line=*/GetMDInt(4),
8811 /*Count=*/GetMDInt(5));
8812      OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
8813                                                         /*Order=*/GetMDInt(6));
8814 break;
8815 }
8816    case OffloadEntriesInfoManager::OffloadEntryInfo::
8817        OffloadingEntryInfoDeviceGlobalVar:
8818      OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
8819          /*MangledName=*/GetMDString(1),
8820          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8821              /*Flags=*/GetMDInt(2)),
8822 /*Order=*/GetMDInt(3));
8823 break;
8824 }
8825 }
8826}
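// The metadata consumed here has the shape produced by
// createOffloadEntriesAndInfoMetadata, e.g. (illustrative values):
// ```
//   !omp_offload.info = !{!0}
//   !0 = !{i32 0, i32 47, i32 26, !"foo", i32 42, i32 0, i32 0}
// ```
// i.e. kind 0 (target region), device and file IDs, parent name, line,
// count, and creation order.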
8827
8828void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
8829  if (HostFilePath.empty())
8830 return;
8831
8832 auto Buf = MemoryBuffer::getFile(HostFilePath);
8833 if (std::error_code Err = Buf.getError()) {
8834 report_fatal_error(("error opening host file from host file path inside of "
8835 "OpenMPIRBuilder: " +
8836 Err.message())
8837 .c_str());
8838 }
8839
8840 LLVMContext Ctx;
8841  auto M = expectedToErrorOrAndEmitErrors(
8842      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
8843 if (std::error_code Err = M.getError()) {
8845 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
8846 .c_str());
8847 }
8848
8849 loadOffloadInfoMetadata(*M.get());
8850}
8851
8852//===----------------------------------------------------------------------===//
8853// OffloadEntriesInfoManager
8854//===----------------------------------------------------------------------===//
8855
8856bool OffloadEntriesInfoManager::empty() const {
8857  return OffloadEntriesTargetRegion.empty() &&
8858 OffloadEntriesDeviceGlobalVar.empty();
8859}
8860
8861unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
8862 const TargetRegionEntryInfo &EntryInfo) const {
8863 auto It = OffloadEntriesTargetRegionCount.find(
8864 getTargetRegionEntryCountKey(EntryInfo));
8865 if (It == OffloadEntriesTargetRegionCount.end())
8866 return 0;
8867 return It->second;
8868}
8869
8870void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
8871 const TargetRegionEntryInfo &EntryInfo) {
8872 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
8873 EntryInfo.Count + 1;
8874}
8875
8876/// Initialize target region entry.
8877void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
8878    const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
8879 OffloadEntriesTargetRegion[EntryInfo] =
8880 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
8881 OMPTargetRegionEntryTargetRegion);
8882 ++OffloadingEntriesNum;
8883}
8884
8885void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
8886    TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
8887    OMPTargetRegionEntryKind Flags) {
8888  assert(EntryInfo.Count == 0 && "expected default EntryInfo");
8889
8890 // Update the EntryInfo with the next available count for this location.
8891 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8892
8893 // If we are emitting code for a target, the entry is already initialized,
8894 // only has to be registered.
8895 if (OMPBuilder->Config.isTargetDevice()) {
8896 // This could happen if the device compilation is invoked standalone.
8897 if (!hasTargetRegionEntryInfo(EntryInfo)) {
8898 return;
8899 }
8900 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
8901 Entry.setAddress(Addr);
8902 Entry.setID(ID);
8903 Entry.setFlags(Flags);
8904 } else {
8905    if (Flags == OMPTargetRegionEntryTargetRegion &&
8906        hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
8907 return;
8908 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
8909 "Target region entry already registered!");
8910 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
8911 OffloadEntriesTargetRegion[EntryInfo] = Entry;
8912 ++OffloadingEntriesNum;
8913 }
8914 incrementTargetRegionEntryInfoCount(EntryInfo);
8915}
8916
8917bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
8918    TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
8919
8920 // Update the EntryInfo with the next available count for this location.
8921 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8922
8923 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
8924 if (It == OffloadEntriesTargetRegion.end()) {
8925 return false;
8926 }
8927 // Fail if this entry is already registered.
8928 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
8929 return false;
8930 return true;
8931}
8932
8933void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
8934    const OffloadTargetRegionEntryInfoActTy &Action) {
8935 // Scan all target region entries and perform the provided action.
8936 for (const auto &It : OffloadEntriesTargetRegion) {
8937 Action(It.first, It.second);
8938 }
8939}
8940
8941void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
8942    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
8943 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
8944 ++OffloadingEntriesNum;
8945}
8946
8947void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
8948    StringRef VarName, Constant *Addr, int64_t VarSize,
8949    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
8950 if (OMPBuilder->Config.isTargetDevice()) {
8951 // This could happen if the device compilation is invoked standalone.
8952 if (!hasDeviceGlobalVarEntryInfo(VarName))
8953 return;
8954 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
8955 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
8956 if (Entry.getVarSize() == 0) {
8957 Entry.setVarSize(VarSize);
8958 Entry.setLinkage(Linkage);
8959 }
8960 return;
8961 }
8962 Entry.setVarSize(VarSize);
8963 Entry.setLinkage(Linkage);
8964 Entry.setAddress(Addr);
8965 } else {
8966 if (hasDeviceGlobalVarEntryInfo(VarName)) {
8967 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
8968 assert(Entry.isValid() && Entry.getFlags() == Flags &&
8969 "Entry not initialized!");
8970 if (Entry.getVarSize() == 0) {
8971 Entry.setVarSize(VarSize);
8972 Entry.setLinkage(Linkage);
8973 }
8974 return;
8975 }
8976    if (Flags == OMPTargetGlobalVarEntryIndirect)
8977      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
8978 Addr, VarSize, Flags, Linkage,
8979 VarName.str());
8980 else
8981 OffloadEntriesDeviceGlobalVar.try_emplace(
8982 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
8983 ++OffloadingEntriesNum;
8984 }
8985}
8986
8987void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
8988    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
8989  // Scan all device global variable entries and perform the provided action.
8990 for (const auto &E : OffloadEntriesDeviceGlobalVar)
8991 Action(E.getKey(), E.getValue());
8992}
8993
8994//===----------------------------------------------------------------------===//
8995// CanonicalLoopInfo
8996//===----------------------------------------------------------------------===//
8997
8998void CanonicalLoopInfo::collectControlBlocks(
8999    SmallVectorImpl<BasicBlock *> &BBs) {
9000  // We only count those BBs as control block for which we do not need to
9001 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9002 // flow. For consistency, this also means we do not add the Body block, which
9003 // is just the entry to the body code.
9004 BBs.reserve(BBs.size() + 6);
9005 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9006}
9007
9008BasicBlock *CanonicalLoopInfo::getPreheader() const {
9009  assert(isValid() && "Requires a valid canonical loop");
9010 for (BasicBlock *Pred : predecessors(Header)) {
9011 if (Pred != Latch)
9012 return Pred;
9013 }
9014 llvm_unreachable("Missing preheader");
9015}
9016
9017void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9018 assert(isValid() && "Requires a valid canonical loop");
9019
9020 Instruction *CmpI = &getCond()->front();
9021 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9022 CmpI->setOperand(1, TripCount);
9023
9024#ifndef NDEBUG
9025 assertOK();
9026#endif
9027}
9028
9029void CanonicalLoopInfo::mapIndVar(
9030 llvm::function_ref<Value *(Instruction *)> Updater) {
9031 assert(isValid() && "Requires a valid canonical loop");
9032
9033 Instruction *OldIV = getIndVar();
9034
9035 // Record all uses excluding those introduced by the updater. Uses by the
9036 // CanonicalLoopInfo itself to keep track of the number of iterations are
9037 // excluded.
9038 SmallVector<Use *> ReplacableUses;
9039 for (Use &U : OldIV->uses()) {
9040 auto *User = dyn_cast<Instruction>(U.getUser());
9041 if (!User)
9042 continue;
9043 if (User->getParent() == getCond())
9044 continue;
9045 if (User->getParent() == getLatch())
9046 continue;
9047 ReplacableUses.push_back(&U);
9048 }
9049
9050 // Run the updater that may introduce new uses
9051 Value *NewIV = Updater(OldIV);
9052
9053 // Replace the old uses with the value returned by the updater.
9054 for (Use *U : ReplacableUses)
9055 U->set(NewIV);
9056
9057#ifndef NDEBUG
9058 assertOK();
9059#endif
9060}
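// Sketch (names assumed): scaling the induction variable by a stride for all
// body uses without disturbing the loop-control uses:
// ```
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(&*CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateMul(OldIV, Stride, "scaled.iv");
//   });
// ```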
9061
9062void CanonicalLoopInfo::assertOK() const {
9063#ifndef NDEBUG
9064 // No constraints if this object currently does not describe a loop.
9065 if (!isValid())
9066 return;
9067
9068 BasicBlock *Preheader = getPreheader();
9069 BasicBlock *Body = getBody();
9070 BasicBlock *After = getAfter();
9071
9072 // Verify standard control-flow we use for OpenMP loops.
9073 assert(Preheader);
9074 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9075 "Preheader must terminate with unconditional branch");
9076 assert(Preheader->getSingleSuccessor() == Header &&
9077 "Preheader must jump to header");
9078
9079 assert(Header);
9080 assert(isa<BranchInst>(Header->getTerminator()) &&
9081 "Header must terminate with unconditional branch");
9082 assert(Header->getSingleSuccessor() == Cond &&
9083 "Header must jump to exiting block");
9084
9085 assert(Cond);
9086 assert(Cond->getSinglePredecessor() == Header &&
9087 "Exiting block only reachable from header");
9088
9089 assert(isa<BranchInst>(Cond->getTerminator()) &&
9090 "Exiting block must terminate with conditional branch");
9091 assert(size(successors(Cond)) == 2 &&
9092 "Exiting block must have two successors");
9093 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9094 "Exiting block's first successor jump to the body");
9095 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9096 "Exiting block's second successor must exit the loop");
9097
9098 assert(Body);
9099 assert(Body->getSinglePredecessor() == Cond &&
9100 "Body only reachable from exiting block");
9101 assert(!isa<PHINode>(Body->front()));
9102
9103 assert(Latch);
9104 assert(isa<BranchInst>(Latch->getTerminator()) &&
9105 "Latch must terminate with unconditional branch");
9106 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9107  // TODO: To support simple redirecting of the end of the body code that has
9108  // multiple exits; introduce another auxiliary block like preheader and after.
9109 assert(Latch->getSinglePredecessor() != nullptr);
9110 assert(!isa<PHINode>(Latch->front()));
9111
9112 assert(Exit);
9113 assert(isa<BranchInst>(Exit->getTerminator()) &&
9114 "Exit block must terminate with unconditional branch");
9115 assert(Exit->getSingleSuccessor() == After &&
9116 "Exit block must jump to after block");
9117
9118 assert(After);
9119 assert(After->getSinglePredecessor() == Exit &&
9120 "After block only reachable from exit block");
9121 assert(After->empty() || !isa<PHINode>(After->front()));
9122
9123 Instruction *IndVar = getIndVar();
9124 assert(IndVar && "Canonical induction variable not found?");
9125 assert(isa<IntegerType>(IndVar->getType()) &&
9126 "Induction variable must be an integer");
9127 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9128 "Induction variable must be a PHI in the loop header");
9129 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9130 assert(
9131 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9132 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9133
9134 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9135 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9136 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9137 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9138 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9139 ->isOne());
9140
9141 Value *TripCount = getTripCount();
9142 assert(TripCount && "Loop trip count not found?");
9143 assert(IndVar->getType() == TripCount->getType() &&
9144 "Trip count and induction variable must have the same type");
9145
9146 auto *CmpI = cast<CmpInst>(&Cond->front());
9147 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9148 "Exit condition must be a signed less-than comparison");
9149 assert(CmpI->getOperand(0) == IndVar &&
9150 "Exit condition must compare the induction variable");
9151 assert(CmpI->getOperand(1) == TripCount &&
9152 "Exit condition must compare with the trip count");
9153#endif
9154}
9155
9156void CanonicalLoopInfo::invalidate() {
9157  Header = nullptr;
9158 Cond = nullptr;
9159 Latch = nullptr;
9160 Exit = nullptr;
9161}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={})
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:61
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:122
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:97
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:115
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:102
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:126
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:93
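A minimal sketch of how the AllocaInst queries above fit together, assuming an IRBuilderBase `Builder` already positioned in a function's entry block (the helper name is hypothetical):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

void allocaQueries(IRBuilderBase &Builder, const DataLayout &DL) {
  AllocaInst *Slot = Builder.CreateAlloca(Builder.getInt64Ty(),
                                          DL.getAllocaAddrSpace(),
                                          /*ArraySize=*/nullptr, "tmp");
  Type *ElemTy = Slot->getAllocatedType();                     // i64
  Align A = Slot->getAlign();                                  // ABI align
  std::optional<TypeSize> Bytes = Slot->getAllocationSize(DL); // 8 bytes
  (void)ElemTy; (void)A; (void)Bytes;
}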
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:467
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:644
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ FSub
*p = old - v
Definition: Instructions.h:736
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:748
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:752
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
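A minimal sketch of emitting one of these opcodes through IRBuilderBase::CreateAtomicRMW (listed further below), assuming a builder with a valid insertion point; the helper name is hypothetical:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Emits `atomicrmw add ptr %Ptr, i32 %Val monotonic`; with an empty
// MaybeAlign the builder falls back to the type's ABI alignment.
Value *emitAtomicAdd(IRBuilderBase &Builder, Value *Ptr, Value *Val) {
  return Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val,
                                 MaybeAlign(), AtomicOrdering::Monotonic);
}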
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:577
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:865
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:850
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:391
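A minimal sketch combining the AttrBuilder and AttributeList entries above, assuming a function `F` is in scope (the helper name is hypothetical):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Adds `nounwind` to F's function attributes via an AttrBuilder.
void markNoUnwind(Function &F) {
  LLVMContext &Ctx = F.getContext();
  AttrBuilder B(Ctx);
  B.addAttribute(Attribute::NoUnwind);
  F.setAttributes(F.getAttributes().addFnAttributes(Ctx, B));
}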
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:414
reverse_iterator rbegin()
Definition: BasicBlock.h:454
bool empty() const
Definition: BasicBlock.h:460
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const Instruction & front() const
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:575
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:495
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:169
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:465
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:487
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:277
reverse_iterator rend()
Definition: BasicBlock.h:456
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:384
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:366
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:621
const Instruction & back() const
Definition: BasicBlock.h:463
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:290
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:514
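A minimal sketch using splitBasicBlock from the entries above, assuming an instruction `I` inside a well-formed block (the helper name is hypothetical):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Splits I's block before I; the head block is re-terminated with an
// unconditional branch to the new tail block, which starts at I.
BasicBlock *splitBefore(Instruction *I) {
  BasicBlock *Head = I->getParent();
  return Head->splitBasicBlock(I->getIterator(), "split.tail");
}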
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1385
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1391
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
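A minimal sketch exercising these accessors, assuming `CLI` was produced by OpenMPIRBuilder::createCanonicalLoop (see below) and has not been invalidated; the helper name is hypothetical:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include <cassert>
using namespace llvm;

void inspectLoop(CanonicalLoopInfo *CLI) {
  assert(CLI->isValid() && "loop was invalidated");
  Value *TripCount = CLI->getTripCount(); // number of iterations
  Instruction *IV = CLI->getIndVar();     // logical induction variable
  BasicBlock *Body = CLI->getBody();      // single entry for user code
  CLI->assertOK();                        // consistency self-check
  (void)TripCount; (void)IV; (void)Body;
}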
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
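A minimal sketch of the CodeExtractor flow above, assuming `Blocks` forms a single-entry region inside `F`; the helper name is hypothetical:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

// Outlines the region `Blocks` of F into a new function, or returns
// nullptr if the region is not eligible for extraction.
Function *outlineRegion(Function &F, ArrayRef<BasicBlock *> Blocks) {
  CodeExtractor CE(Blocks);
  if (!CE.isEligible())
    return nullptr;
  CodeExtractorAnalysisCache CEAC(F);
  return CE.extractCodeRegion(CEAC);
}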
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2938
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:706
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2215
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2230
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2295
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1800
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:294
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:276
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:750
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
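A minimal sketch of the DataLayout queries above, assuming a module `M` and a type `Ty` (the helper name is hypothetical):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
using namespace llvm;

void sizeQueries(Module &M, Type *Ty) {
  const DataLayout &DL = M.getDataLayout();
  TypeSize AllocBytes = DL.getTypeAllocSize(Ty); // incl. alignment padding
  TypeSize StoreBytes = DL.getTypeStoreSize(Ty); // max bytes a store writes
  Align Pref = DL.getPrefTypeAlign(Ty);          // preferred alignment
  unsigned PtrBytes = DL.getPointerSize();       // pointer size in AS 0
  (void)AllocBytes; (void)StoreBytes; (void)Pref; (void)PtrBytes;
}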
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:629
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:165
const BasicBlock & getEntryBlock() const
Definition: Function.h:800
bool empty() const
Definition: Function.h:822
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:443
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:745
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:757
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
const Function & getFunction() const
Definition: Function.h:163
iterator begin()
Definition: Function.h:816
arg_iterator arg_begin()
Definition: Function.h:831
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:353
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:657
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:745
size_t arg_size() const
Definition: Function.h:864
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:212
iterator end()
Definition: Function.h:818
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:278
Argument * getArg(unsigned i) const
Definition: Function.h:849
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1521
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:52
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:254
BasicBlock * getBlock() const
Definition: IRBuilder.h:269
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:267
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:270
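A minimal sketch of the save/restore pattern built on InsertPoint (see saveIP and restoreIP below), assuming a builder and a target block; the helper name is hypothetical:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Temporarily redirects the builder to the end of `Other`, then
// restores the previously saved position.
void emitElsewhere(IRBuilderBase &Builder, BasicBlock *Other) {
  IRBuilderBase::InsertPoint Saved = Builder.saveIP();
  Builder.SetInsertPoint(Other);
  // ... create instructions at the end of Other ...
  if (Saved.isSet())
    Builder.restoreIP(Saved);
}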
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1107
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2262
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1846
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1778
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2528
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2044
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1268
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2175
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1307
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1976
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:578
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2038
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1341
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1879
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2187
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1383
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2250
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:518
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1726
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:274
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:1997
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2371
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1148
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2246
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1349
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1125
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1795
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2026
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1480
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1095
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1917
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1963
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1808
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1332
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2554
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1859
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2012
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1502
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1119
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2278
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:478
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2258
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2201
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:286
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2549
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:561
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1831
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2417
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1461
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1524
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2356
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1409
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:656
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2059
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1366
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
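A minimal sketch chaining a few of the Create* helpers listed above, assuming a builder with a valid insertion point; the helper name is hypothetical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Computes max(X, 0) with an explicit compare-and-select.
Value *clampToZero(IRBuilderBase &Builder, Value *X) {
  Value *Zero = ConstantInt::get(X->getType(), 0);
  Value *IsNeg = Builder.CreateICmpSLT(X, Zero, "is.neg");
  return Builder.CreateSelect(IsNeg, Zero, X, "clamped");
}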
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:938
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:120
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
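A minimal sketch combining MDNode::get, MDString::get, and Instruction::setMetadata from the entries above; the metadata kind and string are hypothetical:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Builds a one-operand MDNode and attaches it to I under a custom
// metadata kind name.
void tagInstruction(Instruction *I) {
  LLVMContext &Ctx = I->getContext();
  MDNode *Node = MDNode::get(Ctx, MDString::get(Ctx, "example.tag"));
  I->setMetadata("example.kind", Node);
}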
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
NamedMDNode * getNamedMetadata(const Twine &Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:262
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:284
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:297
iterator_range< global_iterator > globals()
Definition: Module.h:701
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:613
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:446
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:135
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:271
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:461
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
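A minimal sketch of the Module symbol-table lookups above; the global name "my_flag" is hypothetical, and each lookup returns null when the symbol is absent:

#include "llvm/IR/Module.h"
using namespace llvm;

void moduleLookups(Module &M) {
  Function *F = M.getFunction("__kmpc_barrier");       // null if absent
  GlobalVariable *GV = M.getGlobalVariable("my_flag"); // null if absent
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  (void)F; (void)GV; (void)MD;
}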
A tuple of MDNodes.
Definition: Metadata.h:1729
iterator_range< op_iterator > operands()
Definition: Metadata.h:1825
void addOperand(MDNode *M)
Definition: Metadata.cpp:1387
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:236
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:238
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:369
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:371
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:289
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:291
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:280
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:349
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:355
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:359
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' entry.
Definition: OMPIRBuilder.h:353
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' entry.
Definition: OMPIRBuilder.h:351
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:425
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:92
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:184
StringRef separator() const
Definition: OMPIRBuilder.h:170
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:160
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:105
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:143
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:137
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:180
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:466
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
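A minimal sketch of looking up and calling a runtime entry point; `Ident` and `ThreadID` stand in for values normally obtained from getOrCreateIdent and getOrCreateThreadID, and the helper name is hypothetical:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

void emitBarrierCall(OpenMPIRBuilder &OMPBuilder, Module &M, Value *Ident,
                     Value *ThreadID) {
  FunctionCallee Fn = OMPBuilder.getOrCreateRuntimeFunction(
      M, omp::RuntimeFunction::OMPRTL___kmpc_barrier);
  OMPBuilder.Builder.CreateCall(Fn, {Ident, ThreadID});
}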
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:512
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for an OpenMP 'if' clause using the specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from the callback.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up its attributes; returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr, ...
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
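A minimal sketch of createCanonicalLoop, assuming the caller supplies the location description and trip count; the helper and loop names are hypothetical:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

// Builds a loop running TripCount iterations; the callback receives the
// body insertion point and the logical induction variable IV.
CanonicalLoopInfo *
makeLoop(OpenMPIRBuilder &OMPBuilder,
         const OpenMPIRBuilder::LocationDescription &Loc, Value *TripCount) {
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP, Value *IV) {
    OMPBuilder.Builder.restoreIP(CodeGenIP);
    // ... emit the loop body here, indexed by IV ...
  };
  return OMPBuilder.createCanonicalLoop(Loc, BodyGen, TripCount, "my.loop");
}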
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
InsertPointTy createTarget(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={})
Generator for '#omp target'.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
InsertPointTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool EmitDebug=false, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for '#omp task'.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types). cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
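A minimal sketch of applyWorkshareLoop with static scheduling, assuming `CLI` and `AllocaIP` come from earlier createCanonicalLoop output; the helper name is hypothetical:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

// Lowers CLI to a statically scheduled workshare loop with a trailing
// barrier.
void lowerToWorkshare(OpenMPIRBuilder &OMPBuilder, DebugLoc DL,
                      CanonicalLoopInfo *CLI,
                      OpenMPIRBuilder::InsertPointTy AllocaIP) {
  OMPBuilder.applyWorkshareLoop(DL, CLI, AllocaIP, /*NeedsBarrier=*/true,
                                omp::OMP_SCHEDULE_Static);
}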
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder<>::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
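A minimal sketch of createParallel, assuming the caller provides the location description and alloca insertion point. The privatization callback below treats every capture as shared and the finalization callback is a no-op; real frontends do more here. The helper name is hypothetical:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

void emitParallel(OpenMPIRBuilder &OMPBuilder,
                  const OpenMPIRBuilder::LocationDescription &Loc,
                  OpenMPIRBuilder::InsertPointTy AllocaIP) {
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                     OpenMPIRBuilder::InsertPointTy CodeGenIP) {
    OMPBuilder.Builder.restoreIP(CodeGenIP);
    // ... emit the parallel region body ...
  };
  auto PrivGen = [](OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                    OpenMPIRBuilder::InsertPointTy CodeGenIP, Value &Orig,
                    Value &Inner, Value *&ReplVal)
      -> OpenMPIRBuilder::InsertPointTy {
    ReplVal = &Inner; // share the captured value as-is
    return CodeGenIP;
  };
  auto FiniGen = [](OpenMPIRBuilder::InsertPointTy) {};
  OMPBuilder.Builder.restoreIP(OMPBuilder.createParallel(
      Loc, AllocaIP, BodyGen, PrivGen, FiniGen, /*IfCondition=*/nullptr,
      /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
      /*IsCancellable=*/false));
}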
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:492
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates an offloading entry for the provided entry ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current target region.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder<> Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas we will attempt to raise on a call of finalize, after all currently enqueued outline infos have been processed.
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
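For directives without a body, no callbacks are needed. Assuming an OpenMPIRBuilder OMPBuilder and an IRBuilder<> Builder set up as in the createParallel sketch above, a flush reduces to a single call:

// LocationDescription can be built straight from an IRBuilder; it captures
// the insertion point and the current debug location.
OMPBuilder.createFlush(OpenMPIRBuilder::LocationDescription(Builder));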
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction with registerTargetGlobalVariable to create declare target global variables.
InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, SmallVector< OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
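A minimal sketch of the SmallBitVector queries listed above (smallBitVectorDemo is an invented name):

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

void smallBitVectorDemo() {
  llvm::SmallBitVector BV(8);      // eight bits, initially all clear
  BV.set(3);                       // set a single bit
  assert(BV.test(3));              // the bit reads back as set
  assert(BV.any() && !BV.all());   // one bit set, but not all of them
  BV.set();                        // no argument: set every bit
  assert(BV.all());
}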
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small sizes.
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
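Both set types report through insert() whether the element was actually new, which is the idiomatic way to deduplicate during a traversal. A small sketch (helper names invented):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"

// Deduplicate pointers: insert(...).second is true only for new elements.
bool firstVisit(llvm::SmallPtrSet<int *, 4> &Seen, int *P) {
  return Seen.insert(P).second;
}

// SmallSet behaves the same way for small non-pointer values.
bool firstValue(llvm::SmallSet<int, 8> &Seen, int V) {
  return Seen.insert(V).second;
}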
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
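A short sketch of the SmallString accessors above (buildName is an invented helper):

#include "llvm/ADT/SmallString.h"

llvm::StringRef buildName(llvm::SmallString<64> &Storage) {
  Storage.append("omp.");    // append from a string literal / StringRef
  Storage.append("outlined");
  return Storage.str();      // explicit, allocation-free StringRef view
}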
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
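A sketch of typical SmallVector usage combining reserve() with push_back() (firstSquares is an invented helper):

#include "llvm/ADT/SmallVector.h"
#include <cstddef>

llvm::SmallVector<int, 8> firstSquares(size_t N) {
  llvm::SmallVector<int, 8> V;
  V.reserve(N); // avoid regrowth when the final size is known
  for (size_t I = 0; I < N; ++I)
    V.push_back(static_cast<int>(I * I));
  return V;
}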
An instruction for storing to memory.
Definition: Instructions.h:290
void setAlignment(Align Align)
Definition: Instructions.h:333
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:360
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: StringMap.h:253
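A sketch contrasting operator[] (which inserts) with lookup() (which does not); countUses is an invented helper:

#include "llvm/ADT/StringMap.h"

unsigned countUses(llvm::StringMap<unsigned> &Uses, llvm::StringRef Name) {
  ++Uses[Name];             // inserts a zero-initialized entry if missing
  return Uses.lookup(Name); // lookup never inserts; absent keys yield 0
}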
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:685
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:436
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:601
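A small sketch exercising the StringRef operations above (stringRefDemo is an invented name):

#include "llvm/ADT/StringRef.h"
#include <cassert>

void stringRefDemo() {
  llvm::StringRef S("libfoo.so");
  auto [Stem, Ext] = S.split('.');     // "libfoo" and "so"
  assert(Ext == "so");
  assert(S.ends_with(".so"));
  assert(S.drop_back(3) == "libfoo");  // drop the ".so" suffix
  assert(S.count('o') == 3);
  assert(!Stem.empty() && Stem.size() == 6);
}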
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:955
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1013
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1023
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
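Twine is built for cheap, lazy concatenation of temporaries; materialize the result with str() before storing it. A sketch (makeBlockName is an invented helper):

#include "llvm/ADT/Twine.h"
#include <string>

std::string makeBlockName(unsigned Idx) {
  // The concatenation is not evaluated until str() is called.
  return (llvm::Twine("omp.region.") + llvm::Twine(Idx)).str();
}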
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:127
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:143
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldReplace returns true.
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of this value).
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
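A self-contained sketch of replaceAllUsesWith, the workhorse behind many of the rewrites in this file (rauwDemo is an invented helper):

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Build %mul = (%x + 0) * 2, then fold the no-op add away via RAUW.
void rauwDemo(LLVMContext &Ctx, Module &M) {
  auto *FTy = FunctionType::get(Type::getInt32Ty(Ctx),
                                {Type::getInt32Ty(Ctx)}, /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "demo", M);
  auto *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);

  Value *X = F->getArg(0);
  Value *Add = B.CreateAdd(X, B.getInt32(0), "x.plus.0");
  Value *Mul = B.CreateMul(Add, B.getInt32(2), "mul");
  B.CreateRet(Mul);

  Add->replaceAllUsesWith(X);                // every use of Add now uses X
  cast<Instruction>(Add)->eraseFromParent(); // Add is dead; remove it
}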
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:812
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:64
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined; the runtime should get it from environment variables in the spec-mentioned way.
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp.h).
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition: STLExtras.h:2400
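A sketch of zip and enumerate over small ranges (enumerateZipDemo is an invented name):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

void enumerateZipDemo() {
  llvm::SmallVector<int, 4> A = {1, 2, 3};
  llvm::SmallVector<char, 4> B = {'x', 'y', 'z'};

  // enumerate: pairs each element with its 0-based index.
  for (auto [Idx, Val] : llvm::enumerate(A))
    llvm::outs() << Idx << " -> " << Val << "\n";

  // zip: walks both ranges in lockstep, stopping at the shorter one.
  for (auto [I, C] : llvm::zip(A, B))
    llvm::outs() << I << C << "\n";
}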
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:656
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular function.
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:872
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
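A quick sketch of the two integer helpers from MathExtras.h that appear in this index, Log2_32 (above) and divideCeil (mathExtrasDemo is an invented name):

#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathExtrasDemo() {
  assert(llvm::Log2_32(64) == 6);       // floor log base 2
  assert(llvm::Log2_32(100) == 6);      // 2^6 = 64 <= 100 < 128
  assert(llvm::divideCeil(10, 3) == 4); // integer ceiling division
}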
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user specified parameters.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column).
Definition: OMPIRBuilder.h:608
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, user-defined mappers, and non-contiguous information.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * NumTeams
The number of teams.
Value * DynCGGroupMem
The size of the dynamic shared memory.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has the 'nowait' clause.
Value * NumThreads
The number of threads.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:198
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), deviceRTLs, and clang.
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of the maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61