LLVM 19.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/CFG.h"
31#include "llvm/IR/CallingConv.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
36#include "llvm/IR/Function.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/LLVMContext.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
42#include "llvm/IR/PassManager.h"
45#include "llvm/IR/Value.h"
57
58#include <cstdint>
59#include <optional>
60#include <stack>
61
62#define DEBUG_TYPE "openmp-ir-builder"
63
64using namespace llvm;
65using namespace omp;
66
67static cl::opt<bool>
68 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
69 cl::desc("Use optimistic attributes describing "
70 "'as-if' properties of runtime calls."),
71 cl::init(false));
72
74 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
75 cl::desc("Factor for the unroll threshold to account for code "
76 "simplifications still taking place"),
77 cl::init(1.5));
78
79#ifndef NDEBUG
80/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
81/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
82/// an InsertPoint stores the instruction before something is inserted. For
83/// instance, if both point to the same instruction, two IRBuilders alternating
84/// creating instruction will cause the instructions to be interleaved.
87 if (!IP1.isSet() || !IP2.isSet())
88 return false;
89 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
90}
91
93 // Valid ordered/unordered and base algorithm combinations.
94 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
95 case OMPScheduleType::UnorderedStaticChunked:
96 case OMPScheduleType::UnorderedStatic:
97 case OMPScheduleType::UnorderedDynamicChunked:
98 case OMPScheduleType::UnorderedGuidedChunked:
99 case OMPScheduleType::UnorderedRuntime:
100 case OMPScheduleType::UnorderedAuto:
101 case OMPScheduleType::UnorderedTrapezoidal:
102 case OMPScheduleType::UnorderedGreedy:
103 case OMPScheduleType::UnorderedBalanced:
104 case OMPScheduleType::UnorderedGuidedIterativeChunked:
105 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
106 case OMPScheduleType::UnorderedSteal:
107 case OMPScheduleType::UnorderedStaticBalancedChunked:
108 case OMPScheduleType::UnorderedGuidedSimd:
109 case OMPScheduleType::UnorderedRuntimeSimd:
110 case OMPScheduleType::OrderedStaticChunked:
111 case OMPScheduleType::OrderedStatic:
112 case OMPScheduleType::OrderedDynamicChunked:
113 case OMPScheduleType::OrderedGuidedChunked:
114 case OMPScheduleType::OrderedRuntime:
115 case OMPScheduleType::OrderedAuto:
116 case OMPScheduleType::OrderdTrapezoidal:
117 case OMPScheduleType::NomergeUnorderedStaticChunked:
118 case OMPScheduleType::NomergeUnorderedStatic:
119 case OMPScheduleType::NomergeUnorderedDynamicChunked:
120 case OMPScheduleType::NomergeUnorderedGuidedChunked:
121 case OMPScheduleType::NomergeUnorderedRuntime:
122 case OMPScheduleType::NomergeUnorderedAuto:
123 case OMPScheduleType::NomergeUnorderedTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedGreedy:
125 case OMPScheduleType::NomergeUnorderedBalanced:
126 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
128 case OMPScheduleType::NomergeUnorderedSteal:
129 case OMPScheduleType::NomergeOrderedStaticChunked:
130 case OMPScheduleType::NomergeOrderedStatic:
131 case OMPScheduleType::NomergeOrderedDynamicChunked:
132 case OMPScheduleType::NomergeOrderedGuidedChunked:
133 case OMPScheduleType::NomergeOrderedRuntime:
134 case OMPScheduleType::NomergeOrderedAuto:
135 case OMPScheduleType::NomergeOrderedTrapezoidal:
136 break;
137 default:
138 return false;
139 }
140
141 // Must not set both monotonicity modifiers at the same time.
142 OMPScheduleType MonotonicityFlags =
143 SchedType & OMPScheduleType::MonotonicityMask;
144 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
145 return false;
146
147 return true;
148}
149#endif
150
151static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
152 if (T.isAMDGPU()) {
153 StringRef Features =
154 Kernel->getFnAttribute("target-features").getValueAsString();
155 if (Features.count("+wavefrontsize64"))
156 return omp::getAMDGPUGridValues<64>();
157 return omp::getAMDGPUGridValues<32>();
158 }
159 if (T.isNVPTX())
161 llvm_unreachable("No grid value available for this architecture!");
162}
163
164/// Determine which scheduling algorithm to use, determined from schedule clause
165/// arguments.
166static OMPScheduleType
167getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
168 bool HasSimdModifier) {
169 // Currently, the default schedule it static.
170 switch (ClauseKind) {
171 case OMP_SCHEDULE_Default:
172 case OMP_SCHEDULE_Static:
173 return HasChunks ? OMPScheduleType::BaseStaticChunked
174 : OMPScheduleType::BaseStatic;
175 case OMP_SCHEDULE_Dynamic:
176 return OMPScheduleType::BaseDynamicChunked;
177 case OMP_SCHEDULE_Guided:
178 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
179 : OMPScheduleType::BaseGuidedChunked;
180 case OMP_SCHEDULE_Auto:
182 case OMP_SCHEDULE_Runtime:
183 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
184 : OMPScheduleType::BaseRuntime;
185 }
186 llvm_unreachable("unhandled schedule clause argument");
187}
188
189/// Adds ordering modifier flags to schedule type.
190static OMPScheduleType
192 bool HasOrderedClause) {
193 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
194 OMPScheduleType::None &&
195 "Must not have ordering nor monotonicity flags already set");
196
197 OMPScheduleType OrderingModifier = HasOrderedClause
198 ? OMPScheduleType::ModifierOrdered
199 : OMPScheduleType::ModifierUnordered;
200 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
201
202 // Unsupported combinations
203 if (OrderingScheduleType ==
204 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
205 return OMPScheduleType::OrderedGuidedChunked;
206 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
207 OMPScheduleType::ModifierOrdered))
208 return OMPScheduleType::OrderedRuntime;
209
210 return OrderingScheduleType;
211}
212
213/// Adds monotonicity modifier flags to schedule type.
214static OMPScheduleType
216 bool HasSimdModifier, bool HasMonotonic,
217 bool HasNonmonotonic, bool HasOrderedClause) {
218 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
219 OMPScheduleType::None &&
220 "Must not have monotonicity flags already set");
221 assert((!HasMonotonic || !HasNonmonotonic) &&
222 "Monotonic and Nonmonotonic are contradicting each other");
223
224 if (HasMonotonic) {
225 return ScheduleType | OMPScheduleType::ModifierMonotonic;
226 } else if (HasNonmonotonic) {
227 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
228 } else {
229 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
230 // If the static schedule kind is specified or if the ordered clause is
231 // specified, and if the nonmonotonic modifier is not specified, the
232 // effect is as if the monotonic modifier is specified. Otherwise, unless
233 // the monotonic modifier is specified, the effect is as if the
234 // nonmonotonic modifier is specified.
235 OMPScheduleType BaseScheduleType =
236 ScheduleType & ~OMPScheduleType::ModifierMask;
237 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
238 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
239 HasOrderedClause) {
240 // The monotonic is used by default in openmp runtime library, so no need
241 // to set it.
242 return ScheduleType;
243 } else {
244 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
245 }
246 }
247}
248
249/// Determine the schedule type using schedule and ordering clause arguments.
250static OMPScheduleType
251computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
252 bool HasSimdModifier, bool HasMonotonicModifier,
253 bool HasNonmonotonicModifier, bool HasOrderedClause) {
254 OMPScheduleType BaseSchedule =
255 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
256 OMPScheduleType OrderedSchedule =
257 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
259 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
260 HasNonmonotonicModifier, HasOrderedClause);
261
263 return Result;
264}
265
266/// Make \p Source branch to \p Target.
267///
268/// Handles two situations:
269/// * \p Source already has an unconditional branch.
270/// * \p Source is a degenerate block (no terminator because the BB is
271/// the current head of the IR construction).
273 if (Instruction *Term = Source->getTerminator()) {
274 auto *Br = cast<BranchInst>(Term);
275 assert(!Br->isConditional() &&
276 "BB's terminator must be an unconditional branch (or degenerate)");
277 BasicBlock *Succ = Br->getSuccessor(0);
278 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
279 Br->setSuccessor(0, Target);
280 return;
281 }
282
283 auto *NewBr = BranchInst::Create(Target, Source);
284 NewBr->setDebugLoc(DL);
285}
286
288 bool CreateBranch) {
289 assert(New->getFirstInsertionPt() == New->begin() &&
290 "Target BB must not have PHI nodes");
291
292 // Move instructions to new block.
293 BasicBlock *Old = IP.getBlock();
294 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
295
296 if (CreateBranch)
297 BranchInst::Create(New, Old);
298}
299
300void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
302 BasicBlock *Old = Builder.GetInsertBlock();
303
304 spliceBB(Builder.saveIP(), New, CreateBranch);
305 if (CreateBranch)
306 Builder.SetInsertPoint(Old->getTerminator());
307 else
308 Builder.SetInsertPoint(Old);
309
310 // SetInsertPoint also updates the Builder's debug location, but we want to
311 // keep the one the Builder was configured to use.
313}
314
317 BasicBlock *Old = IP.getBlock();
319 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
320 Old->getParent(), Old->getNextNode());
321 spliceBB(IP, New, CreateBranch);
322 New->replaceSuccessorsPhiUsesWith(Old, New);
323 return New;
324}
325
326BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
329 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
330 if (CreateBranch)
331 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
332 else
333 Builder.SetInsertPoint(Builder.GetInsertBlock());
334 // SetInsertPoint also updates the Builder's debug location, but we want to
335 // keep the one the Builder was configured to use.
337 return New;
338}
339
340BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
343 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
344 if (CreateBranch)
345 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
346 else
347 Builder.SetInsertPoint(Builder.GetInsertBlock());
348 // SetInsertPoint also updates the Builder's debug location, but we want to
349 // keep the one the Builder was configured to use.
351 return New;
352}
353
355 llvm::Twine Suffix) {
356 BasicBlock *Old = Builder.GetInsertBlock();
357 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
358}
359
360// This function creates a fake integer value and a fake use for the integer
361// value. It returns the fake value created. This is useful in modeling the
362// extra arguments to the outlined functions.
364 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
365 std::stack<Instruction *> &ToBeDeleted,
366 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
367 const Twine &Name = "", bool AsPtr = true) {
368 Builder.restoreIP(OuterAllocaIP);
369 Instruction *FakeVal;
370 AllocaInst *FakeValAddr =
371 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
372 ToBeDeleted.push(FakeValAddr);
373
374 if (AsPtr) {
375 FakeVal = FakeValAddr;
376 } else {
377 FakeVal =
378 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
379 ToBeDeleted.push(FakeVal);
380 }
381
382 // Generate a fake use of this value
383 Builder.restoreIP(InnerAllocaIP);
384 Instruction *UseFakeVal;
385 if (AsPtr) {
386 UseFakeVal =
387 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
388 } else {
389 UseFakeVal =
390 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
391 }
392 ToBeDeleted.push(UseFakeVal);
393 return FakeVal;
394}
395
396//===----------------------------------------------------------------------===//
397// OpenMPIRBuilderConfig
398//===----------------------------------------------------------------------===//
399
400namespace {
402/// Values for bit flags for marking which requires clauses have been used.
403enum OpenMPOffloadingRequiresDirFlags {
404 /// flag undefined.
405 OMP_REQ_UNDEFINED = 0x000,
406 /// no requires directive present.
407 OMP_REQ_NONE = 0x001,
408 /// reverse_offload clause.
409 OMP_REQ_REVERSE_OFFLOAD = 0x002,
410 /// unified_address clause.
411 OMP_REQ_UNIFIED_ADDRESS = 0x004,
412 /// unified_shared_memory clause.
413 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
414 /// dynamic_allocators clause.
415 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
416 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
417};
418
419} // anonymous namespace
420
422 : RequiresFlags(OMP_REQ_UNDEFINED) {}
423
425 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
426 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
427 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
428 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
429 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
430 RequiresFlags(OMP_REQ_UNDEFINED) {
431 if (HasRequiresReverseOffload)
432 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
433 if (HasRequiresUnifiedAddress)
434 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
435 if (HasRequiresUnifiedSharedMemory)
436 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
437 if (HasRequiresDynamicAllocators)
438 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
439}
440
442 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
443}
444
446 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
447}
448
450 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
451}
452
454 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
455}
456
458 return hasRequiresFlags() ? RequiresFlags
459 : static_cast<int64_t>(OMP_REQ_NONE);
460}
461
463 if (Value)
464 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
465 else
466 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
467}
468
470 if (Value)
471 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
472 else
473 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
474}
475
477 if (Value)
478 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
479 else
480 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
481}
482
484 if (Value)
485 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
486 else
487 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
488}
489
490//===----------------------------------------------------------------------===//
491// OpenMPIRBuilder
492//===----------------------------------------------------------------------===//
493
495 IRBuilderBase &Builder,
496 SmallVector<Value *> &ArgsVector) {
498 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
499 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
500 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
501 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
502
503 Value *NumTeams3D =
504 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
505 Value *NumThreads3D =
506 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});
507
508 ArgsVector = {Version,
509 PointerNum,
510 KernelArgs.RTArgs.BasePointersArray,
511 KernelArgs.RTArgs.PointersArray,
512 KernelArgs.RTArgs.SizesArray,
513 KernelArgs.RTArgs.MapTypesArray,
514 KernelArgs.RTArgs.MapNamesArray,
515 KernelArgs.RTArgs.MappersArray,
516 KernelArgs.NumIterations,
517 Flags,
518 NumTeams3D,
519 NumThreads3D,
520 KernelArgs.DynCGGroupMem};
521}
522
524 LLVMContext &Ctx = Fn.getContext();
525
526 // Get the function's current attributes.
527 auto Attrs = Fn.getAttributes();
528 auto FnAttrs = Attrs.getFnAttrs();
529 auto RetAttrs = Attrs.getRetAttrs();
531 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
532 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
533
534 // Add AS to FnAS while taking special care with integer extensions.
535 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
536 bool Param = true) -> void {
537 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
538 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
539 if (HasSignExt || HasZeroExt) {
540 assert(AS.getNumAttributes() == 1 &&
541 "Currently not handling extension attr combined with others.");
542 if (Param) {
543 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
544 FnAS = FnAS.addAttribute(Ctx, AK);
545 } else if (auto AK =
546 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
547 FnAS = FnAS.addAttribute(Ctx, AK);
548 } else {
549 FnAS = FnAS.addAttributes(Ctx, AS);
550 }
551 };
552
553#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
554#include "llvm/Frontend/OpenMP/OMPKinds.def"
555
556 // Add attributes to the function declaration.
557 switch (FnID) {
558#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
559 case Enum: \
560 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
561 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
562 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
563 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
564 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
565 break;
566#include "llvm/Frontend/OpenMP/OMPKinds.def"
567 default:
568 // Attributes are optional.
569 break;
570 }
571}
572
575 FunctionType *FnTy = nullptr;
576 Function *Fn = nullptr;
577
578 // Try to find the declation in the module first.
579 switch (FnID) {
580#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
581 case Enum: \
582 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
583 IsVarArg); \
584 Fn = M.getFunction(Str); \
585 break;
586#include "llvm/Frontend/OpenMP/OMPKinds.def"
587 }
588
589 if (!Fn) {
590 // Create a new declaration if we need one.
591 switch (FnID) {
592#define OMP_RTL(Enum, Str, ...) \
593 case Enum: \
594 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
595 break;
596#include "llvm/Frontend/OpenMP/OMPKinds.def"
597 }
598
599 // Add information if the runtime function takes a callback function
600 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
601 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
602 LLVMContext &Ctx = Fn->getContext();
603 MDBuilder MDB(Ctx);
604 // Annotate the callback behavior of the runtime function:
605 // - The callback callee is argument number 2 (microtask).
606 // - The first two arguments of the callback callee are unknown (-1).
607 // - All variadic arguments to the runtime function are passed to the
608 // callback callee.
609 Fn->addMetadata(
610 LLVMContext::MD_callback,
612 2, {-1, -1}, /* VarArgsArePassed */ true)}));
613 }
614 }
615
616 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
617 << " with type " << *Fn->getFunctionType() << "\n");
618 addAttributes(FnID, *Fn);
619
620 } else {
621 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
622 << " with type " << *Fn->getFunctionType() << "\n");
623 }
624
625 assert(Fn && "Failed to create OpenMP runtime function");
626
627 return {FnTy, Fn};
628}
629
632 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
633 assert(Fn && "Failed to create OpenMP runtime function pointer");
634 return Fn;
635}
636
637void OpenMPIRBuilder::initialize() { initializeTypes(M); }
638
641 BasicBlock &EntryBlock = Function->getEntryBlock();
642 Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();
643
644 // Loop over blocks looking for constant allocas, skipping the entry block
645 // as any allocas there are already in the desired location.
646 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
647 Block++) {
648 for (auto Inst = Block->getReverseIterator()->begin();
649 Inst != Block->getReverseIterator()->end();) {
650 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
651 Inst++;
652 if (!isa<ConstantData>(AllocaInst->getArraySize()))
653 continue;
654 AllocaInst->moveBeforePreserving(MoveLocInst);
655 } else {
656 Inst++;
657 }
658 }
659 }
660}
661
663 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
665 SmallVector<OutlineInfo, 16> DeferredOutlines;
666 for (OutlineInfo &OI : OutlineInfos) {
667 // Skip functions that have not finalized yet; may happen with nested
668 // function generation.
669 if (Fn && OI.getFunction() != Fn) {
670 DeferredOutlines.push_back(OI);
671 continue;
672 }
673
674 ParallelRegionBlockSet.clear();
675 Blocks.clear();
676 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
677
678 Function *OuterFn = OI.getFunction();
679 CodeExtractorAnalysisCache CEAC(*OuterFn);
680 // If we generate code for the target device, we need to allocate
681 // struct for aggregate params in the device default alloca address space.
682 // OpenMP runtime requires that the params of the extracted functions are
683 // passed as zero address space pointers. This flag ensures that
684 // CodeExtractor generates correct code for extracted functions
685 // which are used by OpenMP runtime.
686 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
687 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
688 /* AggregateArgs */ true,
689 /* BlockFrequencyInfo */ nullptr,
690 /* BranchProbabilityInfo */ nullptr,
691 /* AssumptionCache */ nullptr,
692 /* AllowVarArgs */ true,
693 /* AllowAlloca */ true,
694 /* AllocaBlock*/ OI.OuterAllocaBB,
695 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
696
697 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
698 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
699 << " Exit: " << OI.ExitBB->getName() << "\n");
700 assert(Extractor.isEligible() &&
701 "Expected OpenMP outlining to be possible!");
702
703 for (auto *V : OI.ExcludeArgsFromAggregate)
704 Extractor.excludeArgFromAggregate(V);
705
706 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
707
708 // Forward target-cpu, target-features attributes to the outlined function.
709 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
710 if (TargetCpuAttr.isStringAttribute())
711 OutlinedFn->addFnAttr(TargetCpuAttr);
712
713 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
714 if (TargetFeaturesAttr.isStringAttribute())
715 OutlinedFn->addFnAttr(TargetFeaturesAttr);
716
717 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
718 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
719 assert(OutlinedFn->getReturnType()->isVoidTy() &&
720 "OpenMP outlined functions should not return a value!");
721
722 // For compability with the clang CG we move the outlined function after the
723 // one with the parallel region.
724 OutlinedFn->removeFromParent();
725 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
726
727 // Remove the artificial entry introduced by the extractor right away, we
728 // made our own entry block after all.
729 {
730 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
731 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
732 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
733 // Move instructions from the to-be-deleted ArtificialEntry to the entry
734 // basic block of the parallel region. CodeExtractor generates
735 // instructions to unwrap the aggregate argument and may sink
736 // allocas/bitcasts for values that are solely used in the outlined region
737 // and do not escape.
738 assert(!ArtificialEntry.empty() &&
739 "Expected instructions to add in the outlined region entry");
740 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
741 End = ArtificialEntry.rend();
742 It != End;) {
743 Instruction &I = *It;
744 It++;
745
746 if (I.isTerminator())
747 continue;
748
749 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
750 }
751
752 OI.EntryBB->moveBefore(&ArtificialEntry);
753 ArtificialEntry.eraseFromParent();
754 }
755 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
756 assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
757
758 // Run a user callback, e.g. to add attributes.
759 if (OI.PostOutlineCB)
760 OI.PostOutlineCB(*OutlinedFn);
761 }
762
763 // Remove work items that have been completed.
764 OutlineInfos = std::move(DeferredOutlines);
765
766 // The createTarget functions embeds user written code into
767 // the target region which may inject allocas which need to
768 // be moved to the entry block of our target or risk malformed
769 // optimisations by later passes, this is only relevant for
770 // the device pass which appears to be a little more delicate
771 // when it comes to optimisations (however, we do not block on
772 // that here, it's up to the inserter to the list to do so).
773 // This notbaly has to occur after the OutlinedInfo candidates
774 // have been extracted so we have an end product that will not
775 // be implicitly adversely affected by any raises unless
776 // intentionally appended to the list.
777 // NOTE: This only does so for ConstantData, it could be extended
778 // to ConstantExpr's with further effort, however, they should
779 // largely be folded when they get here. Extending it to runtime
780 // defined/read+writeable allocation sizes would be non-trivial
781 // (need to factor in movement of any stores to variables the
782 // allocation size depends on, as well as the usual loads,
783 // otherwise it'll yield the wrong result after movement) and
784 // likely be more suitable as an LLVM optimisation pass.
787
788 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
789 [](EmitMetadataErrorKind Kind,
790 const TargetRegionEntryInfo &EntryInfo) -> void {
791 errs() << "Error of kind: " << Kind
792 << " when emitting offload entries and metadata during "
793 "OMPIRBuilder finalization \n";
794 };
795
798
799 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
800 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
801 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
802 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
803 }
804}
805
807 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
808}
809
812 auto *GV =
813 new GlobalVariable(M, I32Ty,
814 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
815 ConstantInt::get(I32Ty, Value), Name);
816 GV->setVisibility(GlobalValue::HiddenVisibility);
817
818 return GV;
819}
820
822 uint32_t SrcLocStrSize,
823 IdentFlag LocFlags,
824 unsigned Reserve2Flags) {
825 // Enable "C-mode".
826 LocFlags |= OMP_IDENT_FLAG_KMPC;
827
828 Constant *&Ident =
829 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
830 if (!Ident) {
832 Constant *IdentData[] = {I32Null,
833 ConstantInt::get(Int32, uint32_t(LocFlags)),
834 ConstantInt::get(Int32, Reserve2Flags),
835 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
836 Constant *Initializer =
837 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
838
839 // Look for existing encoding of the location + flags, not needed but
840 // minimizes the difference to the existing solution while we transition.
841 for (GlobalVariable &GV : M.globals())
842 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
843 if (GV.getInitializer() == Initializer)
844 Ident = &GV;
845
846 if (!Ident) {
847 auto *GV = new GlobalVariable(
848 M, OpenMPIRBuilder::Ident,
849 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
852 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
853 GV->setAlignment(Align(8));
854 Ident = GV;
855 }
856 }
857
859}
860
862 uint32_t &SrcLocStrSize) {
863 SrcLocStrSize = LocStr.size();
864 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
865 if (!SrcLocStr) {
866 Constant *Initializer =
868
869 // Look for existing encoding of the location, not needed but minimizes the
870 // difference to the existing solution while we transition.
871 for (GlobalVariable &GV : M.globals())
872 if (GV.isConstant() && GV.hasInitializer() &&
873 GV.getInitializer() == Initializer)
874 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
875
876 SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
877 /* AddressSpace */ 0, &M);
878 }
879 return SrcLocStr;
880}
881
883 StringRef FileName,
884 unsigned Line, unsigned Column,
885 uint32_t &SrcLocStrSize) {
886 SmallString<128> Buffer;
887 Buffer.push_back(';');
888 Buffer.append(FileName);
889 Buffer.push_back(';');
890 Buffer.append(FunctionName);
891 Buffer.push_back(';');
892 Buffer.append(std::to_string(Line));
893 Buffer.push_back(';');
894 Buffer.append(std::to_string(Column));
895 Buffer.push_back(';');
896 Buffer.push_back(';');
897 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
898}
899
900Constant *
902 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
903 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
904}
905
907 uint32_t &SrcLocStrSize,
908 Function *F) {
909 DILocation *DIL = DL.get();
910 if (!DIL)
911 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
912 StringRef FileName = M.getName();
913 if (DIFile *DIF = DIL->getFile())
914 if (std::optional<StringRef> Source = DIF->getSource())
915 FileName = *Source;
916 StringRef Function = DIL->getScope()->getSubprogram()->getName();
917 if (Function.empty() && F)
918 Function = F->getName();
919 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
920 DIL->getColumn(), SrcLocStrSize);
921}
922
924 uint32_t &SrcLocStrSize) {
925 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
926 Loc.IP.getBlock()->getParent());
927}
928
930 return Builder.CreateCall(
931 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
932 "omp_global_thread_num");
933}
934
937 bool ForceSimpleCall, bool CheckCancelFlag) {
938 if (!updateToLocation(Loc))
939 return Loc.IP;
940
941 // Build call __kmpc_cancel_barrier(loc, thread_id) or
942 // __kmpc_barrier(loc, thread_id);
943
944 IdentFlag BarrierLocFlags;
945 switch (Kind) {
946 case OMPD_for:
947 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
948 break;
949 case OMPD_sections:
950 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
951 break;
952 case OMPD_single:
953 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
954 break;
955 case OMPD_barrier:
956 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
957 break;
958 default:
959 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
960 break;
961 }
962
963 uint32_t SrcLocStrSize;
964 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
965 Value *Args[] = {
966 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
967 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
968
969 // If we are in a cancellable parallel region, barriers are cancellation
970 // points.
971 // TODO: Check why we would force simple calls or to ignore the cancel flag.
972 bool UseCancelBarrier =
973 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
974
975 Value *Result =
977 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
978 : OMPRTL___kmpc_barrier),
979 Args);
980
981 if (UseCancelBarrier && CheckCancelFlag)
982 emitCancelationCheckImpl(Result, OMPD_parallel);
983
984 return Builder.saveIP();
985}
986
989 Value *IfCondition,
990 omp::Directive CanceledDirective) {
991 if (!updateToLocation(Loc))
992 return Loc.IP;
993
994 // LLVM utilities like blocks with terminators.
995 auto *UI = Builder.CreateUnreachable();
996
997 Instruction *ThenTI = UI, *ElseTI = nullptr;
998 if (IfCondition)
999 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1000 Builder.SetInsertPoint(ThenTI);
1001
1002 Value *CancelKind = nullptr;
1003 switch (CanceledDirective) {
1004#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1005 case DirectiveEnum: \
1006 CancelKind = Builder.getInt32(Value); \
1007 break;
1008#include "llvm/Frontend/OpenMP/OMPKinds.def"
1009 default:
1010 llvm_unreachable("Unknown cancel kind!");
1011 }
1012
1013 uint32_t SrcLocStrSize;
1014 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1015 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1016 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1017 Value *Result = Builder.CreateCall(
1018 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1019 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
1020 if (CanceledDirective == OMPD_parallel) {
1022 Builder.restoreIP(IP);
1024 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
1025 /* CheckCancelFlag */ false);
1026 }
1027 };
1028
1029 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1030 emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);
1031
1032 // Update the insertion point and remove the terminator we introduced.
1033 Builder.SetInsertPoint(UI->getParent());
1034 UI->eraseFromParent();
1035
1036 return Builder.saveIP();
1037}
1038
1040 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1041 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1042 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1043 if (!updateToLocation(Loc))
1044 return Loc.IP;
1045
1046 Builder.restoreIP(AllocaIP);
1047 auto *KernelArgsPtr =
1048 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1049 Builder.restoreIP(Loc.IP);
1050
1051 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1052 llvm::Value *Arg =
1053 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1055 KernelArgs[I], Arg,
1056 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1057 }
1058
1059 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1060 NumThreads, HostPtr, KernelArgsPtr};
1061
1062 Return = Builder.CreateCall(
1063 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1064 OffloadingArgs);
1065
1066 return Builder.saveIP();
1067}
1068
1070 const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
1071 EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
1072 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1073
1074 if (!updateToLocation(Loc))
1075 return Loc.IP;
1076
1077 Builder.restoreIP(Loc.IP);
1078 // On top of the arrays that were filled up, the target offloading call
1079 // takes as arguments the device id as well as the host pointer. The host
1080 // pointer is used by the runtime library to identify the current target
1081 // region, so it only has to be unique and not necessarily point to
1082 // anything. It could be the pointer to the outlined function that
1083 // implements the target region, but we aren't using that so that the
1084 // compiler doesn't need to keep that, and could therefore inline the host
1085 // function if proven worthwhile during optimization.
1086
1087 // From this point on, we need to have an ID of the target region defined.
1088 assert(OutlinedFnID && "Invalid outlined function ID!");
1089 (void)OutlinedFnID;
1090
1091 // Return value of the runtime offloading call.
1092 Value *Return = nullptr;
1093
1094 // Arguments for the target kernel.
1095 SmallVector<Value *> ArgsVector;
1096 getKernelArgsVector(Args, Builder, ArgsVector);
1097
1098 // The target region is an outlined function launched by the runtime
1099 // via calls to __tgt_target_kernel().
1100 //
1101 // Note that on the host and CPU targets, the runtime implementation of
1102 // these calls simply call the outlined function without forking threads.
1103 // The outlined functions themselves have runtime calls to
1104 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1105 // the compiler in emitTeamsCall() and emitParallelCall().
1106 //
1107 // In contrast, on the NVPTX target, the implementation of
1108 // __tgt_target_teams() launches a GPU kernel with the requested number
1109 // of teams and threads so no additional calls to the runtime are required.
1110 // Check the error code and execute the host version if required.
1111 Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
1112 Args.NumTeams, Args.NumThreads,
1113 OutlinedFnID, ArgsVector));
1114
1115 BasicBlock *OffloadFailedBlock =
1116 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1117 BasicBlock *OffloadContBlock =
1118 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1120 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1121
1122 auto CurFn = Builder.GetInsertBlock()->getParent();
1123 emitBlock(OffloadFailedBlock, CurFn);
1124 Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
1125 emitBranch(OffloadContBlock);
1126 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1127 return Builder.saveIP();
1128}
1129
1131 omp::Directive CanceledDirective,
1132 FinalizeCallbackTy ExitCB) {
1133 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1134 "Unexpected cancellation!");
1135
1136 // For a cancel barrier we create two new blocks.
1138 BasicBlock *NonCancellationBlock;
1139 if (Builder.GetInsertPoint() == BB->end()) {
1140 // TODO: This branch will not be needed once we moved to the
1141 // OpenMPIRBuilder codegen completely.
1142 NonCancellationBlock = BasicBlock::Create(
1143 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1144 } else {
1145 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1148 }
1149 BasicBlock *CancellationBlock = BasicBlock::Create(
1150 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1151
1152 // Jump to them based on the return value.
1153 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1154 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1155 /* TODO weight */ nullptr, nullptr);
1156
1157 // From the cancellation block we finalize all variables and go to the
1158 // post finalization block that is known to the FiniCB callback.
1159 Builder.SetInsertPoint(CancellationBlock);
1160 if (ExitCB)
1161 ExitCB(Builder.saveIP());
1162 auto &FI = FinalizationStack.back();
1163 FI.FiniCB(Builder.saveIP());
1164
1165 // The continuation block is where code generation continues.
1166 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1167}
1168
1169// Callback used to create OpenMP runtime calls to support
1170// omp parallel clause for the device.
1171// We need to use this callback to replace call to the OutlinedFn in OuterFn
1172// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
1174 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1175 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1176 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1177 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1178 // Add some known attributes.
1179 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1180 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1181 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1182 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1183 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1184 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1185
1186 assert(OutlinedFn.arg_size() >= 2 &&
1187 "Expected at least tid and bounded tid as arguments");
1188 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1189
1190 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1191 assert(CI && "Expected call instruction to outlined function");
1192 CI->getParent()->setName("omp_parallel");
1193
1194 Builder.SetInsertPoint(CI);
1195 Type *PtrTy = OMPIRBuilder->VoidPtr;
1196 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1197
1198 // Add alloca for kernel args
1199 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1200 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1201 AllocaInst *ArgsAlloca =
1202 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1203 Value *Args = ArgsAlloca;
1204 // Add address space cast if array for storing arguments is not allocated
1205 // in address space 0
1206 if (ArgsAlloca->getAddressSpace())
1207 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1208 Builder.restoreIP(CurrentIP);
1209
1210 // Store captured vars which are used by kmpc_parallel_51
1211 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1212 Value *V = *(CI->arg_begin() + 2 + Idx);
1213 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1214 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1215 Builder.CreateStore(V, StoreAddress);
1216 }
1217
1218 Value *Cond =
1219 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1220 : Builder.getInt32(1);
1221
1222 // Build kmpc_parallel_51 call
1223 Value *Parallel51CallArgs[] = {
1224 /* identifier*/ Ident,
1225 /* global thread num*/ ThreadID,
1226 /* if expression */ Cond,
1227 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1228 /* Proc bind */ Builder.getInt32(-1),
1229 /* outlined function */
1230 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
1231 /* wrapper function */ NullPtrValue,
1232 /* arguments of the outlined funciton*/ Args,
1233 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1234
1235 FunctionCallee RTLFn =
1236 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1237
1238 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1239
1240 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1241 << *Builder.GetInsertBlock()->getParent() << "\n");
1242
1243 // Initialize the local TID stack location with the argument value.
1244 Builder.SetInsertPoint(PrivTID);
1245 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1246 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1247 PrivTIDAddr);
1248
1249 // Remove redundant call to the outlined function.
1250 CI->eraseFromParent();
1251
1252 for (Instruction *I : ToBeDeleted) {
1253 I->eraseFromParent();
1254 }
1255}
1256
1257// Callback used to create OpenMP runtime calls to support
1258// omp parallel clause for the host.
1259// We need to use this callback to replace call to the OutlinedFn in OuterFn
1260// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1261static void
1263 Function *OuterFn, Value *Ident, Value *IfCondition,
1264 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1265 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1266 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1267 FunctionCallee RTLFn;
1268 if (IfCondition) {
1269 RTLFn =
1270 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1271 } else {
1272 RTLFn =
1273 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1274 }
1275 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1276 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1277 LLVMContext &Ctx = F->getContext();
1278 MDBuilder MDB(Ctx);
1279 // Annotate the callback behavior of the __kmpc_fork_call:
1280 // - The callback callee is argument number 2 (microtask).
1281 // - The first two arguments of the callback callee are unknown (-1).
1282 // - All variadic arguments to the __kmpc_fork_call are passed to the
1283 // callback callee.
1284 F->addMetadata(LLVMContext::MD_callback,
1286 2, {-1, -1},
1287 /* VarArgsArePassed */ true)}));
1288 }
1289 }
1290 // Add some known attributes.
1291 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1292 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1293 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1294
1295 assert(OutlinedFn.arg_size() >= 2 &&
1296 "Expected at least tid and bounded tid as arguments");
1297 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1298
1299 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1300 CI->getParent()->setName("omp_parallel");
1301 Builder.SetInsertPoint(CI);
1302
1303 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1304 Value *ForkCallArgs[] = {
1305 Ident, Builder.getInt32(NumCapturedVars),
1306 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
1307
1308 SmallVector<Value *, 16> RealArgs;
1309 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1310 if (IfCondition) {
1311 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1312 RealArgs.push_back(Cond);
1313 }
1314 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1315
1316 // __kmpc_fork_call_if always expects a void ptr as the last argument
1317 // If there are no arguments, pass a null pointer.
1318 auto PtrTy = OMPIRBuilder->VoidPtr;
1319 if (IfCondition && NumCapturedVars == 0) {
1320 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1321 RealArgs.push_back(NullPtrValue);
1322 }
1323 if (IfCondition && RealArgs.back()->getType() != PtrTy)
1324 RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
1325
1326 Builder.CreateCall(RTLFn, RealArgs);
1327
1328 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1329 << *Builder.GetInsertBlock()->getParent() << "\n");
1330
1331 // Initialize the local TID stack location with the argument value.
1332 Builder.SetInsertPoint(PrivTID);
1333 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1334 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1335 PrivTIDAddr);
1336
1337 // Remove redundant call to the outlined function.
1338 CI->eraseFromParent();
1339
1340 for (Instruction *I : ToBeDeleted) {
1341 I->eraseFromParent();
1342 }
1343}
1344
1346 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1347 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1348 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1349 omp::ProcBindKind ProcBind, bool IsCancellable) {
1350 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1351
1352 if (!updateToLocation(Loc))
1353 return Loc.IP;
1354
1355 uint32_t SrcLocStrSize;
1356 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1357 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1358 Value *ThreadID = getOrCreateThreadID(Ident);
1359 // If we generate code for the target device, we need to allocate
1360 // struct for aggregate params in the device default alloca address space.
1361 // OpenMP runtime requires that the params of the extracted functions are
1362 // passed as zero address space pointers. This flag ensures that extracted
1363 // function arguments are declared in zero address space
1364 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1365
1366 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1367 // only if we compile for host side.
1368 if (NumThreads && !Config.isTargetDevice()) {
1369 Value *Args[] = {
1370 Ident, ThreadID,
1371 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1373 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1374 }
1375
1376 if (ProcBind != OMP_PROC_BIND_default) {
1377 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1378 Value *Args[] = {
1379 Ident, ThreadID,
1380 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1382 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1383 }
1384
1385 BasicBlock *InsertBB = Builder.GetInsertBlock();
1386 Function *OuterFn = InsertBB->getParent();
1387
1388 // Save the outer alloca block because the insertion iterator may get
1389 // invalidated and we still need this later.
1390 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1391
1392 // Vector to remember instructions we used only during the modeling but which
1393 // we want to delete at the end.
1395
1396 // Change the location to the outer alloca insertion point to create and
1397 // initialize the allocas we pass into the parallel region.
1398 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1399 Builder.restoreIP(NewOuter);
1400 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1401 AllocaInst *ZeroAddrAlloca =
1402 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1403 Instruction *TIDAddr = TIDAddrAlloca;
1404 Instruction *ZeroAddr = ZeroAddrAlloca;
1405 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1406 // Add additional casts to enforce pointers in zero address space
1407 TIDAddr = new AddrSpaceCastInst(
1408 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1409 TIDAddr->insertAfter(TIDAddrAlloca);
1410 ToBeDeleted.push_back(TIDAddr);
1411 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1412 PointerType ::get(M.getContext(), 0),
1413 "zero.addr.ascast");
1414 ZeroAddr->insertAfter(ZeroAddrAlloca);
1415 ToBeDeleted.push_back(ZeroAddr);
1416 }
1417
1418 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1419 // associated arguments in the outlined function, so we delete them later.
1420 ToBeDeleted.push_back(TIDAddrAlloca);
1421 ToBeDeleted.push_back(ZeroAddrAlloca);
1422
1423 // Create an artificial insertion point that will also ensure the blocks we
1424 // are about to split are not degenerated.
1425 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1426
1427 BasicBlock *EntryBB = UI->getParent();
1428 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1429 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1430 BasicBlock *PRegPreFiniBB =
1431 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1432 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1433
1434 auto FiniCBWrapper = [&](InsertPointTy IP) {
1435 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1436 // target to the region exit block.
1437 if (IP.getBlock()->end() == IP.getPoint()) {
1439 Builder.restoreIP(IP);
1440 Instruction *I = Builder.CreateBr(PRegExitBB);
1441 IP = InsertPointTy(I->getParent(), I->getIterator());
1442 }
1443 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1444 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1445 "Unexpected insertion point for finalization call!");
1446 return FiniCB(IP);
1447 };
1448
1449 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1450
1451 // Generate the privatization allocas in the block that will become the entry
1452 // of the outlined function.
1453 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1454 InsertPointTy InnerAllocaIP = Builder.saveIP();
1455
1456 AllocaInst *PrivTIDAddr =
1457 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1458 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1459
1460 // Add some fake uses for OpenMP provided arguments.
1461 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1462 Instruction *ZeroAddrUse =
1463 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1464 ToBeDeleted.push_back(ZeroAddrUse);
1465
1466 // EntryBB
1467 // |
1468 // V
1469 // PRegionEntryBB <- Privatization allocas are placed here.
1470 // |
1471 // V
1472 // PRegionBodyBB <- BodeGen is invoked here.
1473 // |
1474 // V
1475 // PRegPreFiniBB <- The block we will start finalization from.
1476 // |
1477 // V
1478 // PRegionExitBB <- A common exit to simplify block collection.
1479 //
1480
1481 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1482
1483 // Let the caller create the body.
1484 assert(BodyGenCB && "Expected body generation callback!");
1485 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1486 BodyGenCB(InnerAllocaIP, CodeGenIP);
1487
1488 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1489
1490 OutlineInfo OI;
1491 if (Config.isTargetDevice()) {
1492 // Generate OpenMP target specific runtime call
1493 OI.PostOutlineCB = [=, ToBeDeletedVec =
1494 std::move(ToBeDeleted)](Function &OutlinedFn) {
1495 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1496 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1497 ThreadID, ToBeDeletedVec);
1498 };
1499 } else {
1500 // Generate OpenMP host runtime call
1501 OI.PostOutlineCB = [=, ToBeDeletedVec =
1502 std::move(ToBeDeleted)](Function &OutlinedFn) {
1503 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1504 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1505 };
1506 }
1507
1508 OI.OuterAllocaBB = OuterAllocaBlock;
1509 OI.EntryBB = PRegEntryBB;
1510 OI.ExitBB = PRegExitBB;
1511
1512 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1514 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1515
1516 // Ensure a single exit node for the outlined region by creating one.
1517 // We might have multiple incoming edges to the exit now due to finalizations,
1518 // e.g., cancel calls that cause the control flow to leave the region.
1519 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1520 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1521 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1522 Blocks.push_back(PRegOutlinedExitBB);
1523
1524 CodeExtractorAnalysisCache CEAC(*OuterFn);
1525 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1526 /* AggregateArgs */ false,
1527 /* BlockFrequencyInfo */ nullptr,
1528 /* BranchProbabilityInfo */ nullptr,
1529 /* AssumptionCache */ nullptr,
1530 /* AllowVarArgs */ true,
1531 /* AllowAlloca */ true,
1532 /* AllocationBlock */ OuterAllocaBlock,
1533 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1534
1535 // Find inputs to, outputs from the code region.
1536 BasicBlock *CommonExit = nullptr;
1537 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1538 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1539 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1540
1541 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1542
1543 FunctionCallee TIDRTLFn =
1544 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1545
1546 auto PrivHelper = [&](Value &V) {
1547 if (&V == TIDAddr || &V == ZeroAddr) {
1548 OI.ExcludeArgsFromAggregate.push_back(&V);
1549 return;
1550 }
1551
1553 for (Use &U : V.uses())
1554 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1555 if (ParallelRegionBlockSet.count(UserI->getParent()))
1556 Uses.insert(&U);
1557
1558 // __kmpc_fork_call expects extra arguments as pointers. If the input
1559 // already has a pointer type, everything is fine. Otherwise, store the
1560 // value onto stack and load it back inside the to-be-outlined region. This
1561 // will ensure only the pointer will be passed to the function.
1562 // FIXME: if there are more than 15 trailing arguments, they must be
1563 // additionally packed in a struct.
1564 Value *Inner = &V;
1565 if (!V.getType()->isPointerTy()) {
1567 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1568
1569 Builder.restoreIP(OuterAllocaIP);
1570 Value *Ptr =
1571 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1572
1573 // Store to stack at end of the block that currently branches to the entry
1574 // block of the to-be-outlined region.
1575 Builder.SetInsertPoint(InsertBB,
1576 InsertBB->getTerminator()->getIterator());
1577 Builder.CreateStore(&V, Ptr);
1578
1579 // Load back next to allocations in the to-be-outlined region.
1580 Builder.restoreIP(InnerAllocaIP);
1581 Inner = Builder.CreateLoad(V.getType(), Ptr);
1582 }
1583
1584 Value *ReplacementValue = nullptr;
1585 CallInst *CI = dyn_cast<CallInst>(&V);
1586 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1587 ReplacementValue = PrivTID;
1588 } else {
1590 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1591 InnerAllocaIP = {
1592 InnerAllocaIP.getBlock(),
1593 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1594
1595 assert(ReplacementValue &&
1596 "Expected copy/create callback to set replacement value!");
1597 if (ReplacementValue == &V)
1598 return;
1599 }
1600
1601 for (Use *UPtr : Uses)
1602 UPtr->set(ReplacementValue);
1603 };
1604
1605 // Reset the inner alloca insertion as it will be used for loading the values
1606 // wrapped into pointers before passing them into the to-be-outlined region.
1607 // Configure it to insert immediately after the fake use of zero address so
1608 // that they are available in the generated body and so that the
1609 // OpenMP-related values (thread ID and zero address pointers) remain leading
1610 // in the argument list.
1611 InnerAllocaIP = IRBuilder<>::InsertPoint(
1612 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1613
1614 // Reset the outer alloca insertion point to the entry of the relevant block
1615 // in case it was invalidated.
1616 OuterAllocaIP = IRBuilder<>::InsertPoint(
1617 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1618
1619 for (Value *Input : Inputs) {
1620 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1621 PrivHelper(*Input);
1622 }
1623 LLVM_DEBUG({
1624 for (Value *Output : Outputs)
1625 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1626 });
1627 assert(Outputs.empty() &&
1628 "OpenMP outlining should not produce live-out values!");
1629
1630 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1631 LLVM_DEBUG({
1632 for (auto *BB : Blocks)
1633 dbgs() << " PBR: " << BB->getName() << "\n";
1634 });
1635
1636 // Adjust the finalization stack, verify the adjustment, and call the
1637 // finalize function a last time to finalize values between the pre-fini
1638 // block and the exit block if we left the parallel "the normal way".
1639 auto FiniInfo = FinalizationStack.pop_back_val();
1640 (void)FiniInfo;
1641 assert(FiniInfo.DK == OMPD_parallel &&
1642 "Unexpected finalization stack state!");
1643
1644 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1645
1646 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1647 FiniCB(PreFiniIP);
1648
1649 // Register the outlined info.
1650 addOutlineInfo(std::move(OI));
1651
1652 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1653 UI->eraseFromParent();
1654
1655 return AfterIP;
1656}
1657
1659 // Build call void __kmpc_flush(ident_t *loc)
1660 uint32_t SrcLocStrSize;
1661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1662 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1663
1664 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1665}
1666
1668 if (!updateToLocation(Loc))
1669 return;
1670 emitFlush(Loc);
1671}
1672
1674 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1675 // global_tid);
1676 uint32_t SrcLocStrSize;
1677 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1678 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1679 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1680
1681 // Ignore return result until untied tasks are supported.
1682 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1683 Args);
1684}
1685
1687 if (!updateToLocation(Loc))
1688 return;
1689 emitTaskwaitImpl(Loc);
1690}
1691
1693 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1694 uint32_t SrcLocStrSize;
1695 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1696 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1698 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1699
1700 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1701 Args);
1702}
1703
1705 if (!updateToLocation(Loc))
1706 return;
1707 emitTaskyieldImpl(Loc);
1708}
1709
1712 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1713 bool Tied, Value *Final, Value *IfCondition,
1714 SmallVector<DependData> Dependencies) {
1715
1716 if (!updateToLocation(Loc))
1717 return InsertPointTy();
1718
1719 uint32_t SrcLocStrSize;
1720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1722 // The current basic block is split into four basic blocks. After outlining,
1723 // they will be mapped as follows:
1724 // ```
1725 // def current_fn() {
1726 // current_basic_block:
1727 // br label %task.exit
1728 // task.exit:
1729 // ; instructions after task
1730 // }
1731 // def outlined_fn() {
1732 // task.alloca:
1733 // br label %task.body
1734 // task.body:
1735 // ret void
1736 // }
1737 // ```
1738 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1739 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1740 BasicBlock *TaskAllocaBB =
1741 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1742
1743 InsertPointTy TaskAllocaIP =
1744 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1745 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1746 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1747
1748 OutlineInfo OI;
1749 OI.EntryBB = TaskAllocaBB;
1750 OI.OuterAllocaBB = AllocaIP.getBlock();
1751 OI.ExitBB = TaskExitBB;
1752
1753 // Add the thread ID argument.
1754 std::stack<Instruction *> ToBeDeleted;
1756 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1757
1758 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1759 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1760 // Replace the Stale CI by appropriate RTL function call.
1761 assert(OutlinedFn.getNumUses() == 1 &&
1762 "there must be a single user for the outlined function");
1763 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1764
1765 // HasShareds is true if any variables are captured in the outlined region,
1766 // false otherwise.
1767 bool HasShareds = StaleCI->arg_size() > 1;
1768 Builder.SetInsertPoint(StaleCI);
1769
1770 // Gather the arguments for emitting the runtime call for
1771 // @__kmpc_omp_task_alloc
1772 Function *TaskAllocFn =
1773 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1774
1775 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1776 // call.
1777 Value *ThreadID = getOrCreateThreadID(Ident);
1778
1779 // Argument - `flags`
1780 // Task is tied iff (Flags & 1) == 1.
1781 // Task is untied iff (Flags & 1) == 0.
1782 // Task is final iff (Flags & 2) == 2.
1783 // Task is not final iff (Flags & 2) == 0.
1784 // TODO: Handle the other flags.
1785 Value *Flags = Builder.getInt32(Tied);
1786 if (Final) {
1787 Value *FinalFlag =
1789 Flags = Builder.CreateOr(FinalFlag, Flags);
1790 }
1791
1792 // Argument - `sizeof_kmp_task_t` (TaskSize)
1793 // Tasksize refers to the size in bytes of kmp_task_t data structure
1794 // including private vars accessed in task.
1795 // TODO: add kmp_task_t_with_privates (privates)
1796 Value *TaskSize = Builder.getInt64(
1798
1799 // Argument - `sizeof_shareds` (SharedsSize)
1800 // SharedsSize refers to the shareds array size in the kmp_task_t data
1801 // structure.
1802 Value *SharedsSize = Builder.getInt64(0);
1803 if (HasShareds) {
1804 AllocaInst *ArgStructAlloca =
1805 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1806 assert(ArgStructAlloca &&
1807 "Unable to find the alloca instruction corresponding to arguments "
1808 "for extracted function");
1809 StructType *ArgStructType =
1810 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1811 assert(ArgStructType && "Unable to find struct type corresponding to "
1812 "arguments for extracted function");
1813 SharedsSize =
1815 }
1816 // Emit the @__kmpc_omp_task_alloc runtime call
1817 // The runtime call returns a pointer to an area where the task captured
1818 // variables must be copied before the task is run (TaskData)
1819 CallInst *TaskData = Builder.CreateCall(
1820 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1821 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1822 /*task_func=*/&OutlinedFn});
1823
1824 // Copy the arguments for outlined function
1825 if (HasShareds) {
1826 Value *Shareds = StaleCI->getArgOperand(1);
1827 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1828 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1829 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1830 SharedsSize);
1831 }
1832
1833 Value *DepArray = nullptr;
1834 if (Dependencies.size()) {
1835 InsertPointTy OldIP = Builder.saveIP();
1837 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1838
1839 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1840 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1841
1842 unsigned P = 0;
1843 for (const DependData &Dep : Dependencies) {
1844 Value *Base =
1845 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1846 // Store the pointer to the variable
1848 DependInfo, Base,
1849 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1850 Value *DepValPtr =
1852 Builder.CreateStore(DepValPtr, Addr);
1853 // Store the size of the variable
1855 DependInfo, Base,
1856 static_cast<unsigned int>(RTLDependInfoFields::Len));
1858 Dep.DepValueType)),
1859 Size);
1860 // Store the dependency kind
1862 DependInfo, Base,
1863 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1865 ConstantInt::get(Builder.getInt8Ty(),
1866 static_cast<unsigned int>(Dep.DepKind)),
1867 Flags);
1868 ++P;
1869 }
1870
1871 Builder.restoreIP(OldIP);
1872 }
1873
1874 // In the presence of the `if` clause, the following IR is generated:
1875 // ...
1876 // %data = call @__kmpc_omp_task_alloc(...)
1877 // br i1 %if_condition, label %then, label %else
1878 // then:
1879 // call @__kmpc_omp_task(...)
1880 // br label %exit
1881 // else:
1882 // ;; Wait for resolution of dependencies, if any, before
1883 // ;; beginning the task
1884 // call @__kmpc_omp_wait_deps(...)
1885 // call @__kmpc_omp_task_begin_if0(...)
1886 // call @outlined_fn(...)
1887 // call @__kmpc_omp_task_complete_if0(...)
1888 // br label %exit
1889 // exit:
1890 // ...
1891 if (IfCondition) {
1892 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1893 // terminator.
1894 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1895 Instruction *IfTerminator =
1896 Builder.GetInsertPoint()->getParent()->getTerminator();
1897 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1898 Builder.SetInsertPoint(IfTerminator);
1899 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1900 &ElseTI);
1901 Builder.SetInsertPoint(ElseTI);
1902
1903 if (Dependencies.size()) {
1904 Function *TaskWaitFn =
1905 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1907 TaskWaitFn,
1908 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1909 ConstantInt::get(Builder.getInt32Ty(), 0),
1911 }
1912 Function *TaskBeginFn =
1913 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1914 Function *TaskCompleteFn =
1915 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1916 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1917 CallInst *CI = nullptr;
1918 if (HasShareds)
1919 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1920 else
1921 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1922 CI->setDebugLoc(StaleCI->getDebugLoc());
1923 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1924 Builder.SetInsertPoint(ThenTI);
1925 }
1926
1927 if (Dependencies.size()) {
1928 Function *TaskFn =
1929 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1931 TaskFn,
1932 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
1933 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
1935
1936 } else {
1937 // Emit the @__kmpc_omp_task runtime call to spawn the task
1938 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1939 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
1940 }
1941
1942 StaleCI->eraseFromParent();
1943
1944 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
1945 if (HasShareds) {
1946 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
1947 OutlinedFn.getArg(1)->replaceUsesWithIf(
1948 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
1949 }
1950
1951 while (!ToBeDeleted.empty()) {
1952 ToBeDeleted.top()->eraseFromParent();
1953 ToBeDeleted.pop();
1954 }
1955 };
1956
1957 addOutlineInfo(std::move(OI));
1958 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
1959
1960 return Builder.saveIP();
1961}
1962
1965 InsertPointTy AllocaIP,
1966 BodyGenCallbackTy BodyGenCB) {
1967 if (!updateToLocation(Loc))
1968 return InsertPointTy();
1969
1970 uint32_t SrcLocStrSize;
1971 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1972 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1973 Value *ThreadID = getOrCreateThreadID(Ident);
1974
1975 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
1976 Function *TaskgroupFn =
1977 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
1978 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
1979
1980 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
1981 BodyGenCB(AllocaIP, Builder.saveIP());
1982
1983 Builder.SetInsertPoint(TaskgroupExitBB);
1984 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
1985 Function *EndTaskgroupFn =
1986 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
1987 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
1988
1989 return Builder.saveIP();
1990}
1991
// createSections: lowers an OpenMP 'sections' construct. Each section body
// (one entry in SectionCBs) is emitted as a case of a switch that lives
// inside a statically-scheduled canonical loop over the section index; the
// optional finalization callback runs in the block after the loop.
// NOTE(review): this extracted listing has dropped some original lines (the
// embedded line numbering jumps, e.g. 1993->1995, 2010->2012, 2040->2042,
// 2047->2049, 2065->2067), so the code below is kept byte-identical rather
// than reconstructed — the fused numeric prefixes are part of the extracted
// content.
1993 const LocationDescription &Loc, InsertPointTy AllocaIP,
1995 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1996 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1997
1998 if (!updateToLocation(Loc))
1999 return Loc.IP;
2000
// Wrapper: if the finalization request arrives at a block with no trailing
// instructions (the cancellation block), first emit a branch to the
// construct's exit block so the region stays well-formed, then run FiniCB.
2001 auto FiniCBWrapper = [&](InsertPointTy IP) {
2002 if (IP.getBlock()->end() != IP.getPoint())
2003 return FiniCB(IP);
2004 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2005 // will fail because that function requires the Finalization Basic Block to
2006 // have a terminator, which is already removed by EmitOMPRegionBody.
2007 // IP is currently at cancelation block.
2008 // We need to backtrack to the condition block to fetch
2009 // the exit block and create a branch from cancelation
2010 // to exit block.
2012 Builder.restoreIP(IP);
2013 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2014 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2015 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2016 Instruction *I = Builder.CreateBr(ExitBB);
2017 IP = InsertPointTy(I->getParent(), I->getIterator());
2018 return FiniCB(IP);
2019 };
2020
// Register the wrapped finalizer; it is popped again after the loop below.
2021 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2022
2023 // Each section is emitted as a switch case
2024 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2025 // -> OMP.createSection() which generates the IR for each section
2026 // Iterate through all sections and emit a switch construct:
2027 // switch (IV) {
2028 // case 0:
2029 // <SectionStmt[0]>;
2030 // break;
2031 // ...
2032 // case <NumSection> - 1:
2033 // <SectionStmt[<NumSection> - 1]>;
2034 // break;
2035 // }
2036 // ...
2037 // section_loop.after:
2038 // <FiniCB>;
2039 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2040 Builder.restoreIP(CodeGenIP);
2042 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2043 Function *CurFn = Continue->getParent();
2044 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2045
2046 unsigned CaseNumber = 0;
// One switch case per section; each case body branches to the common
// continuation block, and the section callback fills in the case body.
2047 for (auto SectionCB : SectionCBs) {
2049 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2050 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2051 Builder.SetInsertPoint(CaseBB);
2052 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2053 SectionCB(InsertPointTy(),
2054 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2055 CaseNumber++;
2056 }
2057 // remove the existing terminator from body BB since there can be no
2058 // terminators after switch/case
2059 };
2060 // Loop body ends here
2061 // LowerBound, UpperBound, and STride for createCanonicalLoop
2062 Type *I32Ty = Type::getInt32Ty(M.getContext());
2063 Value *LB = ConstantInt::get(I32Ty, 0);
2064 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2065 Value *ST = ConstantInt::get(I32Ty, 1);
2067 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
// Sections are worksharing: apply a static schedule; the trailing barrier is
// suppressed when 'nowait' was requested.
2068 InsertPointTy AfterIP =
2069 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2070
2071 // Apply the finalization callback in LoopAfterBB
2072 auto FiniInfo = FinalizationStack.pop_back_val();
2073 assert(FiniInfo.DK == OMPD_sections &&
2074 "Unexpected finalization stack state!");
2075 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2076 Builder.restoreIP(AfterIP);
2077 BasicBlock *FiniBB =
2078 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2079 CB(Builder.saveIP());
2080 AfterIP = {FiniBB, FiniBB->begin()};
2081 }
2082
2083 return AfterIP;
2084}
2085
// createSection: emits a single 'section' region as an inlined OMP region
// (no runtime calls of its own); cancellation is routed through
// FiniCBWrapper, which inserts a branch to the construct's exit block before
// invoking the user finalization callback.
// NOTE(review): the extracted listing dropped original lines here (embedded
// numbering jumps 2085->2088 and 2102->2104 — the signature header and one
// statement of the wrapper are missing); the code is kept byte-identical,
// fused numeric prefixes included, rather than guessed at.
2088 BodyGenCallbackTy BodyGenCB,
2089 FinalizeCallbackTy FiniCB) {
2090 if (!updateToLocation(Loc))
2091 return Loc.IP;
2092
2093 auto FiniCBWrapper = [&](InsertPointTy IP) {
2094 if (IP.getBlock()->end() != IP.getPoint())
2095 return FiniCB(IP);
2096 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2097 // will fail because that function requires the Finalization Basic Block to
2098 // have a terminator, which is already removed by EmitOMPRegionBody.
2099 // IP is currently at cancelation block.
2100 // We need to backtrack to the condition block to fetch
2101 // the exit block and create a branch from cancelation
2102 // to exit block.
2104 Builder.restoreIP(IP);
2105 auto *CaseBB = Loc.IP.getBlock();
2106 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2107 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2108 Instruction *I = Builder.CreateBr(ExitBB);
2109 IP = InsertPointTy(I->getParent(), I->getIterator());
2110 return FiniCB(IP);
2111 };
2112
2113 Directive OMPD = Directive::OMPD_sections;
2114 // Since we are using Finalization Callback here, HasFinalize
2115 // and IsCancellable have to be true
2116 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2117 /*Conditional*/ false, /*hasFinalize*/ true,
2118 /*IsCancellable*/ true);
2119}
2120
2123 IT++;
2124 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2125}
2126
2127void OpenMPIRBuilder::emitUsed(StringRef Name,
2128 std::vector<WeakTrackingVH> &List) {
2129 if (List.empty())
2130 return;
2131
2132 // Convert List to what ConstantArray needs.
2134 UsedArray.resize(List.size());
2135 for (unsigned I = 0, E = List.size(); I != E; ++I)
2137 cast<Constant>(&*List[I]), Builder.getPtrTy());
2138
2139 if (UsedArray.empty())
2140 return;
2141 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2142
2143 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2144 ConstantArray::get(ATy, UsedArray), Name);
2145
2146 GV->setSection("llvm.metadata");
2147}
2148
2149Value *OpenMPIRBuilder::getGPUThreadID() {
2150 return Builder.CreateCall(
2152 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2153 {});
2154}
2155
2156Value *OpenMPIRBuilder::getGPUWarpSize() {
2157 return Builder.CreateCall(
2158 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2159}
2160
2161Value *OpenMPIRBuilder::getNVPTXWarpID() {
2162 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2163 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2164}
2165
2166Value *OpenMPIRBuilder::getNVPTXLaneID() {
2167 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2168 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2169 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2170 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2171 "nvptx_lane_id");
2172}
2173
2174Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2175 Type *ToType) {
2176 Type *FromType = From->getType();
2177 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2178 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2179 assert(FromSize > 0 && "From size must be greater than zero");
2180 assert(ToSize > 0 && "To size must be greater than zero");
2181 if (FromType == ToType)
2182 return From;
2183 if (FromSize == ToSize)
2184 return Builder.CreateBitCast(From, ToType);
2185 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2186 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2187 InsertPointTy SaveIP = Builder.saveIP();
2188 Builder.restoreIP(AllocaIP);
2189 Value *CastItem = Builder.CreateAlloca(ToType);
2190 Builder.restoreIP(SaveIP);
2191
2193 CastItem, FromType->getPointerTo());
2194 Builder.CreateStore(From, ValCastItem);
2195 return Builder.CreateLoad(ToType, CastItem);
2196}
2197
2198Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2199 Value *Element,
2200 Type *ElementType,
2201 Value *Offset) {
2202 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2203 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2204
2205 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2206 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2207 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2208 Value *WarpSize =
2209 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2211 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2212 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2213 Value *WarpSizeCast =
2214 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2215 Value *ShuffleCall =
2216 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2217 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2218}
2219
2220void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2221 Value *DstAddr, Type *ElemType,
2222 Value *Offset, Type *ReductionArrayTy) {
2224 // Create the loop over the big sized data.
2225 // ptr = (void*)Elem;
2226 // ptrEnd = (void*) Elem + 1;
2227 // Step = 8;
2228 // while (ptr + Step < ptrEnd)
2229 // shuffle((int64_t)*ptr);
2230 // Step = 4;
2231 // while (ptr + Step < ptrEnd)
2232 // shuffle((int32_t)*ptr);
2233 // ...
2234 Type *IndexTy = Builder.getIndexTy(
2236 Value *ElemPtr = DstAddr;
2237 Value *Ptr = SrcAddr;
2238 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2239 if (Size < IntSize)
2240 continue;
2241 Type *IntType = Builder.getIntNTy(IntSize * 8);
2243 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2244 Value *SrcAddrGEP =
2245 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2247 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2248
2249 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2250 if ((Size / IntSize) > 1) {
2252 SrcAddrGEP, Builder.getPtrTy());
2253 BasicBlock *PreCondBB =
2254 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2255 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2256 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2257 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2258 emitBlock(PreCondBB, CurFunc);
2259 PHINode *PhiSrc =
2260 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2261 PhiSrc->addIncoming(Ptr, CurrentBB);
2262 PHINode *PhiDest =
2263 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2264 PhiDest->addIncoming(ElemPtr, CurrentBB);
2265 Ptr = PhiSrc;
2266 ElemPtr = PhiDest;
2267 Value *PtrDiff = Builder.CreatePtrDiff(
2268 Builder.getInt8Ty(), PtrEnd,
2271 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2272 ExitBB);
2273 emitBlock(ThenBB, CurFunc);
2274 Value *Res = createRuntimeShuffleFunction(
2275 AllocaIP,
2277 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2278 IntType, Offset);
2279 Builder.CreateAlignedStore(Res, ElemPtr,
2280 M.getDataLayout().getPrefTypeAlign(ElemType));
2281 Value *LocalPtr =
2282 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2283 Value *LocalElemPtr =
2284 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2285 PhiSrc->addIncoming(LocalPtr, ThenBB);
2286 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2287 emitBranch(PreCondBB);
2288 emitBlock(ExitBB, CurFunc);
2289 } else {
2290 Value *Res = createRuntimeShuffleFunction(
2291 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2292 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2293 Res->getType()->getScalarSizeInBits())
2294 Res = Builder.CreateTrunc(Res, ElemType);
2295 Builder.CreateStore(Res, ElemPtr);
2296 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2297 ElemPtr =
2298 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2299 }
2300 Size = Size % IntSize;
2301 }
2302}
2303
2304void OpenMPIRBuilder::emitReductionListCopy(
2305 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2306 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2307 CopyOptionsTy CopyOptions) {
2308 Type *IndexTy = Builder.getIndexTy(
2310 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2311
2312 // Iterates, element-by-element, through the source Reduce list and
2313 // make a copy.
2314 for (auto En : enumerate(ReductionInfos)) {
2315 const ReductionInfo &RI = En.value();
2316 Value *SrcElementAddr = nullptr;
2317 Value *DestElementAddr = nullptr;
2318 Value *DestElementPtrAddr = nullptr;
2319 // Should we shuffle in an element from a remote lane?
2320 bool ShuffleInElement = false;
2321 // Set to true to update the pointer in the dest Reduce list to a
2322 // newly created element.
2323 bool UpdateDestListPtr = false;
2324
2325 // Step 1.1: Get the address for the src element in the Reduce list.
2326 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2327 ReductionArrayTy, SrcBase,
2328 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2329 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2330
2331 // Step 1.2: Create a temporary to store the element in the destination
2332 // Reduce list.
2333 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2334 ReductionArrayTy, DestBase,
2335 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2336 switch (Action) {
2338 InsertPointTy CurIP = Builder.saveIP();
2339 Builder.restoreIP(AllocaIP);
2340 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2341 ".omp.reduction.element");
2342 DestAlloca->setAlignment(
2343 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2344 DestElementAddr = DestAlloca;
2345 DestElementAddr =
2346 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2347 DestElementAddr->getName() + ".ascast");
2348 Builder.restoreIP(CurIP);
2349 ShuffleInElement = true;
2350 UpdateDestListPtr = true;
2351 break;
2352 }
2354 DestElementAddr =
2355 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2356 break;
2357 }
2358 }
2359
2360 // Now that all active lanes have read the element in the
2361 // Reduce list, shuffle over the value from the remote lane.
2362 if (ShuffleInElement) {
2363 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2364 RemoteLaneOffset, ReductionArrayTy);
2365 } else {
2366 switch (RI.EvaluationKind) {
2367 case EvalKind::Scalar: {
2368 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2369 // Store the source element value to the dest element address.
2370 Builder.CreateStore(Elem, DestElementAddr);
2371 break;
2372 }
2373 case EvalKind::Complex: {
2375 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2376 Value *SrcReal = Builder.CreateLoad(
2377 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2379 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2380 Value *SrcImg = Builder.CreateLoad(
2381 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2382
2384 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2386 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2387 Builder.CreateStore(SrcReal, DestRealPtr);
2388 Builder.CreateStore(SrcImg, DestImgPtr);
2389 break;
2390 }
2391 case EvalKind::Aggregate: {
2392 Value *SizeVal = Builder.getInt64(
2393 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2395 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2396 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2397 SizeVal, false);
2398 break;
2399 }
2400 };
2401 }
2402
2403 // Step 3.1: Modify reference in dest Reduce list as needed.
2404 // Modifying the reference in Reduce list to point to the newly
2405 // created element. The element is live in the current function
2406 // scope and that of functions it invokes (i.e., reduce_function).
2407 // RemoteReduceData[i] = (void*)&RemoteElem
2408 if (UpdateDestListPtr) {
2410 DestElementAddr, Builder.getPtrTy(),
2411 DestElementAddr->getName() + ".ascast");
2412 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2413 }
2414 }
2415}
2416
2417Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2418 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2419 AttributeList FuncAttrs) {
2420 InsertPointTy SavedIP = Builder.saveIP();
2421 LLVMContext &Ctx = M.getContext();
2423 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2424 /* IsVarArg */ false);
2425 Function *WcFunc =
2427 "_omp_reduction_inter_warp_copy_func", &M);
2428 WcFunc->setAttributes(FuncAttrs);
2429 WcFunc->addParamAttr(0, Attribute::NoUndef);
2430 WcFunc->addParamAttr(1, Attribute::NoUndef);
2431 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2432 Builder.SetInsertPoint(EntryBB);
2433
2434 // ReduceList: thread local Reduce list.
2435 // At the stage of the computation when this function is called, partially
2436 // aggregated values reside in the first lane of every active warp.
2437 Argument *ReduceListArg = WcFunc->getArg(0);
2438 // NumWarps: number of warps active in the parallel region. This could
2439 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2440 Argument *NumWarpsArg = WcFunc->getArg(1);
2441
2442 // This array is used as a medium to transfer, one reduce element at a time,
2443 // the data from the first lane of every warp to lanes in the first warp
2444 // in order to perform the final step of a reduction in a parallel region
2445 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2446 // for reduced latency, as well as to have a distinct copy for concurrently
2447 // executing target regions. The array is declared with common linkage so
2448 // as to be shared across compilation units.
2449 StringRef TransferMediumName =
2450 "__openmp_nvptx_data_transfer_temporary_storage";
2451 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2452 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2453 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2454 if (!TransferMedium) {
2455 TransferMedium = new GlobalVariable(
2456 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2457 UndefValue::get(ArrayTy), TransferMediumName,
2458 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2459 /*AddressSpace=*/3);
2460 }
2461
2462 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2463 Value *GPUThreadID = getGPUThreadID();
2464 // nvptx_lane_id = nvptx_id % warpsize
2465 Value *LaneID = getNVPTXLaneID();
2466 // nvptx_warp_id = nvptx_id / warpsize
2467 Value *WarpID = getNVPTXWarpID();
2468
2469 InsertPointTy AllocaIP =
2472 Type *Arg0Type = ReduceListArg->getType();
2473 Type *Arg1Type = NumWarpsArg->getType();
2474 Builder.restoreIP(AllocaIP);
2475 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2476 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2477 AllocaInst *NumWarpsAlloca =
2478 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2480 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2482 NumWarpsAlloca, Arg1Type->getPointerTo(),
2483 NumWarpsAlloca->getName() + ".ascast");
2484 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2485 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2486 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2487 InsertPointTy CodeGenIP =
2489 Builder.restoreIP(CodeGenIP);
2490
2491 Value *ReduceList =
2492 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2493
2494 for (auto En : enumerate(ReductionInfos)) {
2495 //
2496 // Warp master copies reduce element to transfer medium in __shared__
2497 // memory.
2498 //
2499 const ReductionInfo &RI = En.value();
2500 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2501 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2502 Type *CType = Builder.getIntNTy(TySize * 8);
2503
2504 unsigned NumIters = RealTySize / TySize;
2505 if (NumIters == 0)
2506 continue;
2507 Value *Cnt = nullptr;
2508 Value *CntAddr = nullptr;
2509 BasicBlock *PrecondBB = nullptr;
2510 BasicBlock *ExitBB = nullptr;
2511 if (NumIters > 1) {
2512 CodeGenIP = Builder.saveIP();
2513 Builder.restoreIP(AllocaIP);
2514 CntAddr =
2515 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2516
2517 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2518 CntAddr->getName() + ".ascast");
2519 Builder.restoreIP(CodeGenIP);
2521 CntAddr,
2522 /*Volatile=*/false);
2523 PrecondBB = BasicBlock::Create(Ctx, "precond");
2524 ExitBB = BasicBlock::Create(Ctx, "exit");
2525 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2526 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2527 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2528 /*Volatile=*/false);
2530 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2531 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2533 }
2534
2535 // kmpc_barrier.
2536 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2537 omp::Directive::OMPD_unknown,
2538 /* ForceSimpleCall */ false,
2539 /* CheckCancelFlag */ true);
2540 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2541 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2542 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2543
2544 // if (lane_id == 0)
2545 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2546 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2548
2549 // Reduce element = LocalReduceList[i]
2550 auto *RedListArrayTy =
2551 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2552 Type *IndexTy = Builder.getIndexTy(
2554 Value *ElemPtrPtr =
2555 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2556 {ConstantInt::get(IndexTy, 0),
2557 ConstantInt::get(IndexTy, En.index())});
2558 // elemptr = ((CopyType*)(elemptrptr)) + I
2559 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2560 if (NumIters > 1)
2561 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2562
2563 // Get pointer to location in transfer medium.
2564 // MediumPtr = &medium[warp_id]
2565 Value *MediumPtr = Builder.CreateInBoundsGEP(
2566 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2567 // elem = *elemptr
2568 //*MediumPtr = elem
2569 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2570 // Store the source element value to the dest element address.
2571 Builder.CreateStore(Elem, MediumPtr,
2572 /*IsVolatile*/ true);
2573 Builder.CreateBr(MergeBB);
2574
2575 // else
2577 Builder.CreateBr(MergeBB);
2578
2579 // endif
2581 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2582 omp::Directive::OMPD_unknown,
2583 /* ForceSimpleCall */ false,
2584 /* CheckCancelFlag */ true);
2585
2586 // Warp 0 copies reduce element from transfer medium
2587 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2588 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2589 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2590
2591 Value *NumWarpsVal =
2592 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2593 // Up to 32 threads in warp 0 are active.
2594 Value *IsActiveThread =
2595 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2596 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2597
2598 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2599
2600 // SecMediumPtr = &medium[tid]
2601 // SrcMediumVal = *SrcMediumPtr
2602 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2603 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2604 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2605 Value *TargetElemPtrPtr =
2606 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2607 {ConstantInt::get(IndexTy, 0),
2608 ConstantInt::get(IndexTy, En.index())});
2609 Value *TargetElemPtrVal =
2610 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2611 Value *TargetElemPtr = TargetElemPtrVal;
2612 if (NumIters > 1)
2613 TargetElemPtr =
2614 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2615
2616 // *TargetElemPtr = SrcMediumVal;
2617 Value *SrcMediumValue =
2618 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2619 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2620 Builder.CreateBr(W0MergeBB);
2621
2622 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2623 Builder.CreateBr(W0MergeBB);
2624
2625 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2626
2627 if (NumIters > 1) {
2628 Cnt = Builder.CreateNSWAdd(
2629 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2630 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2631
2632 auto *CurFn = Builder.GetInsertBlock()->getParent();
2633 emitBranch(PrecondBB);
2634 emitBlock(ExitBB, CurFn);
2635 }
2636 RealTySize %= TySize;
2637 }
2638 }
2639
2641 Builder.restoreIP(SavedIP);
2642
2643 return WcFunc;
2644}
2645
// Emits the "_omp_reduction_shuffle_and_reduce_func" helper used by GPU
// reductions: it pulls a remote lane's reduce list into a thread-local copy,
// conditionally aggregates it via ReduceFn, and conditionally copies the
// remote list back over the local one, depending on the algorithm version.
// Params: ReductionInfos - one entry per reduction variable; ReduceFn - the
// pairwise reduction function to call; FuncAttrs - attributes propagated to
// the emitted function. Returns the newly created helper Function.
// NOTE(review): the embedded original line numbers jump (e.g. 2650->2652,
// 2655->2657, 2696->2698), so several declaration lines (e.g. the starts of
// the FunctionType::get / Function::Create / AddrCast statements) were lost
// when this listing was extracted — compare against upstream
// llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp before relying on it.
2646Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2647 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2648 AttributeList FuncAttrs) {
2649 LLVMContext &Ctx = M.getContext();
2650 FunctionType *FuncTy =
2652 {Builder.getPtrTy(), Builder.getInt16Ty(),
2653 Builder.getInt16Ty(), Builder.getInt16Ty()},
2654 /* IsVarArg */ false);
2655 Function *SarFunc =
2657 "_omp_reduction_shuffle_and_reduce_func", &M);
2658 SarFunc->setAttributes(FuncAttrs);
2659 SarFunc->addParamAttr(0, Attribute::NoUndef);
2660 SarFunc->addParamAttr(1, Attribute::NoUndef);
2661 SarFunc->addParamAttr(2, Attribute::NoUndef);
2662 SarFunc->addParamAttr(3, Attribute::NoUndef);
// The three i16 arguments are sign-extended at the ABI boundary.
2663 SarFunc->addParamAttr(1, Attribute::SExt);
2664 SarFunc->addParamAttr(2, Attribute::SExt);
2665 SarFunc->addParamAttr(3, Attribute::SExt);
2666 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2667 Builder.SetInsertPoint(EntryBB);
2668
2669 // Thread local Reduce list used to host the values of data to be reduced.
2670 Argument *ReduceListArg = SarFunc->getArg(0);
2671 // Current lane id; could be logical.
2672 Argument *LaneIDArg = SarFunc->getArg(1);
2673 // Offset of the remote source lane relative to the current lane.
2674 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2675 // Algorithm version. This is expected to be known at compile time.
2676 Argument *AlgoVerArg = SarFunc->getArg(3);
2677
// Spill each argument to a stack slot so it can be address-space-cast and
// reloaded below, mirroring clang's codegen for these helpers.
2678 Type *ReduceListArgType = ReduceListArg->getType();
2679 Type *LaneIDArgType = LaneIDArg->getType();
2680 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2681 Value *ReduceListAlloca = Builder.CreateAlloca(
2682 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2683 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2684 LaneIDArg->getName() + ".addr");
2685 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2686 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2687 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2688 AlgoVerArg->getName() + ".addr");
2689 ArrayType *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691
2692 // Create a local thread-private variable to host the Reduce list
2693 // from a remote lane.
2694 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2695 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2696
2698 ReduceListAlloca, ReduceListArgType,
2699 ReduceListAlloca->getName() + ".ascast");
2701 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2702 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2703 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2704 RemoteLaneOffsetAlloca->getName() + ".ascast");
2706 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2708 RemoteReductionListAlloca, Builder.getPtrTy(),
2709 RemoteReductionListAlloca->getName() + ".ascast");
2710
2711 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2712 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2713 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2714 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2715
2716 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2717 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2718 Value *RemoteLaneOffset =
2719 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2720 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2721
2722 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2723
2724 // This loop iterates through the list of reduce elements and copies,
2725 // element by element, from a remote lane in the warp to RemoteReduceList,
2726 // hosted on the thread's stack.
2727 emitReductionListCopy(
2728 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2729 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2730
2731 // The actions to be performed on the Remote Reduce list are dependent
2732 // on the algorithm version.
2733 //
2734 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2735 // LaneId % 2 == 0 && Offset > 0):
2736 // do the reduction value aggregation
2737 //
2738 // The thread local variable Reduce list is mutated in place to host the
2739 // reduced data, which is the aggregated value produced from local and
2740 // remote lanes.
2741 //
2742 // Note that AlgoVer is expected to be a constant integer known at compile
2743 // time.
2744 // When AlgoVer==0, the first conjunction evaluates to true, making
2745 // the entire predicate true during compile time.
2746 // When AlgoVer==1, the second conjunction has only the second part to be
2747 // evaluated during runtime. Other conjunctions evaluate to false
2748 // during compile time.
2749 // When AlgoVer==2, the third conjunction has only the second part to be
2750 // evaluated during runtime. Other conjunctions evaluate to false
2751 // during compile time.
2752 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2753 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2754 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2755 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2756 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
// LaneId % 2 == 0 is computed as (LaneId & 1) == 0.
2757 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2758 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2759 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2760 Value *RemoteOffsetComp =
2761 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2762 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2763 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2764 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2765
2766 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2767 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2768 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2769
// Then-branch: aggregate local and remote lists via ReduceFn (result lands
// in the local Reduce list, per the comment above).
2770 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2773 ReduceList, Builder.getPtrTy());
2774 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2775 RemoteListAddrCast, Builder.getPtrTy());
2776 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2777 ->addFnAttr(Attribute::NoUnwind);
2778 Builder.CreateBr(MergeBB);
2779
2781 Builder.CreateBr(MergeBB);
2782
2784
2785 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2786 // Reduce list.
2787 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2788 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2789 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2790
2791 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2792 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2793 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2794 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2795
2796 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2797 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2798 ReductionInfos, RemoteListAddrCast, ReduceList);
2799 Builder.CreateBr(CpyMergeBB);
2800
2801 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2802 Builder.CreateBr(CpyMergeBB);
2803
2804 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2805
2807
2808 return SarFunc;
2809}
2810
// Emits "_omp_reduction_list_to_global_copy_func": copies each element of a
// thread-local reduce list into slot Idx of the global reductions buffer
// (a ReductionsBufferTy array), dispatching on the element's EvaluationKind
// (scalar store, complex real/imag pair, or memcpy for aggregates).
// Returns the newly created helper Function.
// NOTE(review): embedded original line numbers jump (2813->2815, 2815->2818,
// 2843->2845, ...), so some statement-opening lines (FunctionType::get,
// Function::Create, several AddrCast/GEP declarations, the saved insert
// point later restored as OldIP) are missing from this listing — verify
// against upstream OMPIRBuilder.cpp.
2811Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2812 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2813 AttributeList FuncAttrs) {
2815 LLVMContext &Ctx = M.getContext();
2818 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2819 /* IsVarArg */ false);
2820 Function *LtGCFunc =
2822 "_omp_reduction_list_to_global_copy_func", &M);
2823 LtGCFunc->setAttributes(FuncAttrs);
2824 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2825 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2826 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2827
2828 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2829 Builder.SetInsertPoint(EntryBlock);
2830
2831 // Buffer: global reduction buffer.
2832 Argument *BufferArg = LtGCFunc->getArg(0);
2833 // Idx: index of the buffer.
2834 Argument *IdxArg = LtGCFunc->getArg(1);
2835 // ReduceList: thread local Reduce list.
2836 Argument *ReduceListArg = LtGCFunc->getArg(2);
2837
// Spill the arguments so they can be address-space-cast and reloaded.
2838 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2839 BufferArg->getName() + ".addr");
2840 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2841 IdxArg->getName() + ".addr");
2842 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2843 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2845 BufferArgAlloca, Builder.getPtrTy(),
2846 BufferArgAlloca->getName() + ".ascast");
2848 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2849 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2850 ReduceListArgAlloca, Builder.getPtrTy(),
2851 ReduceListArgAlloca->getName() + ".ascast");
2852
2853 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2854 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2855 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2856
2857 Value *LocalReduceList =
2858 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2859 Value *BufferArgVal =
2860 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2861 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2862 Type *IndexTy = Builder.getIndexTy(
// Per reduction variable: load the element pointer out of the reduce list,
// locate the matching field in the global buffer, and copy per kind.
2864 for (auto En : enumerate(ReductionInfos)) {
2865 const ReductionInfo &RI = En.value();
2866 auto *RedListArrayTy =
2867 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2868 // Reduce element = LocalReduceList[i]
2869 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2870 RedListArrayTy, LocalReduceList,
2871 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2872 // elemptr = ((CopyType*)(elemptrptr)) + I
2873 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2874
2875 // Global = Buffer.VD[Idx];
2876 Value *BufferVD =
2877 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2879 ReductionsBufferTy, BufferVD, 0, En.index());
2880
2881 switch (RI.EvaluationKind) {
2882 case EvalKind::Scalar: {
2883 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2884 Builder.CreateStore(TargetElement, GlobVal);
2885 break;
2886 }
2887 case EvalKind::Complex: {
// Copy real (field 0) and imaginary (field 1) components separately.
2889 RI.ElementType, ElemPtr, 0, 0, ".realp");
2890 Value *SrcReal = Builder.CreateLoad(
2891 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2893 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2894 Value *SrcImg = Builder.CreateLoad(
2895 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2896
2898 RI.ElementType, GlobVal, 0, 0, ".realp");
2900 RI.ElementType, GlobVal, 0, 1, ".imagp");
2901 Builder.CreateStore(SrcReal, DestRealPtr);
2902 Builder.CreateStore(SrcImg, DestImgPtr);
2903 break;
2904 }
2905 case EvalKind::Aggregate: {
// Aggregates are copied bytewise with the element's store size.
2906 Value *SizeVal =
2907 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2909 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2910 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2911 break;
2912 }
2913 }
2914 }
2915
// Restore the caller's insertion point (OldIP was saved near the top; its
// declaration line is among those lost in extraction).
2917 Builder.restoreIP(OldIP);
2918 return LtGCFunc;
2919}
2920
// Emits "_omp_reduction_list_to_global_reduce_func": builds a temporary
// reduce list whose entries point at slot Idx of the global reductions
// buffer, then calls ReduceFn(globalList, threadLocalList) so the global
// buffer accumulates the thread-local values. Returns the helper Function.
// NOTE(review): embedded original line numbers jump (2923->2925, 2925->2928,
// 2960->2963, ...) — the opening lines of several statements (including the
// OldIP save restored at the end) are missing from this extracted listing;
// verify against upstream OMPIRBuilder.cpp.
2921Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
2922 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2923 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
2925 LLVMContext &Ctx = M.getContext();
2928 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2929 /* IsVarArg */ false);
2930 Function *LtGRFunc =
2932 "_omp_reduction_list_to_global_reduce_func", &M);
2933 LtGRFunc->setAttributes(FuncAttrs);
2934 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
2935 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
2936 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
2937
2938 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
2939 Builder.SetInsertPoint(EntryBlock);
2940
2941 // Buffer: global reduction buffer.
2942 Argument *BufferArg = LtGRFunc->getArg(0);
2943 // Idx: index of the buffer.
2944 Argument *IdxArg = LtGRFunc->getArg(1);
2945 // ReduceList: thread local Reduce list.
2946 Argument *ReduceListArg = LtGRFunc->getArg(2);
2947
// Spill the arguments so they can be address-space-cast and reloaded.
2948 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2949 BufferArg->getName() + ".addr");
2950 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2951 IdxArg->getName() + ".addr");
2952 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2953 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2954 auto *RedListArrayTy =
2955 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2956
2957 // 1. Build a list of reduction variables.
2958 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2959 Value *LocalReduceList =
2960 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
2961
2963 BufferArgAlloca, Builder.getPtrTy(),
2964 BufferArgAlloca->getName() + ".ascast");
2966 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2967 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2968 ReduceListArgAlloca, Builder.getPtrTy(),
2969 ReduceListArgAlloca->getName() + ".ascast");
2970 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2971 LocalReduceList, Builder.getPtrTy(),
2972 LocalReduceList->getName() + ".ascast");
2973
2974 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2975 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2976 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2977
2978 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2979 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2980 Type *IndexTy = Builder.getIndexTy(
// Populate the temporary list with pointers into the global buffer so the
// reduce function can operate on the global copy directly.
2982 for (auto En : enumerate(ReductionInfos)) {
2983 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
2984 RedListArrayTy, LocalReduceListAddrCast,
2985 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2986 Value *BufferVD =
2987 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
2988 // Global = Buffer.VD[Idx];
2990 ReductionsBufferTy, BufferVD, 0, En.index());
2991 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
2992 }
2993
2994 // Call reduce_function(GlobalReduceList, ReduceList)
2995 Value *ReduceList =
2996 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2997 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
2998 ->addFnAttr(Attribute::NoUnwind);
3000 Builder.restoreIP(OldIP);
3001 return LtGRFunc;
3002}
3003
// Emits "_omp_reduction_global_to_list_copy_func": the inverse of
// emitListToGlobalCopyFunction — copies each field of slot Idx in the global
// reductions buffer back into the thread-local reduce list, dispatching on
// EvaluationKind (scalar, complex pair, or aggregate memcpy). Returns the
// newly created helper Function.
// NOTE(review): embedded original line numbers jump (3006->3008, 3008->3011,
// 3096->3100, ...) — several statement-opening lines (FunctionType::get,
// Function::Create, the Aggregate-case memcpy call, the OldIP save) are
// missing from this extracted listing; verify against upstream
// OMPIRBuilder.cpp.
3004Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3005 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3006 AttributeList FuncAttrs) {
3008 LLVMContext &Ctx = M.getContext();
3011 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3012 /* IsVarArg */ false);
3013 Function *LtGCFunc =
3015 "_omp_reduction_global_to_list_copy_func", &M);
3016 LtGCFunc->setAttributes(FuncAttrs);
3017 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3018 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3019 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3020
3021 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3022 Builder.SetInsertPoint(EntryBlock);
3023
3024 // Buffer: global reduction buffer.
3025 Argument *BufferArg = LtGCFunc->getArg(0);
3026 // Idx: index of the buffer.
3027 Argument *IdxArg = LtGCFunc->getArg(1);
3028 // ReduceList: thread local Reduce list.
3029 Argument *ReduceListArg = LtGCFunc->getArg(2);
3030
// Spill the arguments so they can be address-space-cast and reloaded.
3031 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3032 BufferArg->getName() + ".addr");
3033 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3034 IdxArg->getName() + ".addr");
3035 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3036 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3038 BufferArgAlloca, Builder.getPtrTy(),
3039 BufferArgAlloca->getName() + ".ascast");
3041 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3042 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 ReduceListArgAlloca, Builder.getPtrTy(),
3044 ReduceListArgAlloca->getName() + ".ascast");
3045 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3046 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3047 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3048
3049 Value *LocalReduceList =
3050 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3051 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3052 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3053 Type *IndexTy = Builder.getIndexTy(
// Per reduction variable: read from the global buffer field, write into the
// thread-local element. Note direction is global -> list here.
3055 for (auto En : enumerate(ReductionInfos)) {
3056 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3057 auto *RedListArrayTy =
3058 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3059 // Reduce element = LocalReduceList[i]
3060 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3061 RedListArrayTy, LocalReduceList,
3062 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3063 // elemptr = ((CopyType*)(elemptrptr)) + I
3064 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3065 // Global = Buffer.VD[Idx];
3066 Value *BufferVD =
3067 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3069 ReductionsBufferTy, BufferVD, 0, En.index());
3070
3071 switch (RI.EvaluationKind) {
3072 case EvalKind::Scalar: {
3073 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3074 Builder.CreateStore(TargetElement, ElemPtr);
3075 break;
3076 }
3077 case EvalKind::Complex: {
// Copy real (field 0) and imaginary (field 1) components separately.
3079 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3080 Value *SrcReal = Builder.CreateLoad(
3081 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3083 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3084 Value *SrcImg = Builder.CreateLoad(
3085 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3086
3088 RI.ElementType, ElemPtr, 0, 0, ".realp");
3090 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3091 Builder.CreateStore(SrcReal, DestRealPtr);
3092 Builder.CreateStore(SrcImg, DestImgPtr);
3093 break;
3094 }
3095 case EvalKind::Aggregate: {
3096 Value *SizeVal =
3100 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3101 SizeVal, false);
3102 break;
3103 }
3104 }
3105 }
3106
3108 Builder.restoreIP(OldIP);
3109 return LtGCFunc;
3110}
3111
// Emits "_omp_reduction_global_to_list_reduce_func": builds a temporary
// reduce list whose entries point at slot Idx of the global reductions
// buffer, then calls ReduceFn(threadLocalList, globalList) — the mirror of
// emitListToGlobalReduceFunction with swapped argument order. Returns the
// helper Function.
// NOTE(review): embedded original line numbers jump (3114->3116, 3117->3119,
// 3152->3154, ...) — the opening lines of several statements (including the
// cast named ReductionList used in the loop, and the OldIP save) are missing
// from this extracted listing; verify against upstream OMPIRBuilder.cpp.
3112Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3113 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3114 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3116 LLVMContext &Ctx = M.getContext();
3117 auto *FuncTy = FunctionType::get(
3119 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3120 /* IsVarArg */ false);
3121 Function *LtGRFunc =
3123 "_omp_reduction_global_to_list_reduce_func", &M);
3124 LtGRFunc->setAttributes(FuncAttrs);
3125 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3126 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3127 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3128
3129 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3130 Builder.SetInsertPoint(EntryBlock);
3131
3132 // Buffer: global reduction buffer.
3133 Argument *BufferArg = LtGRFunc->getArg(0);
3134 // Idx: index of the buffer.
3135 Argument *IdxArg = LtGRFunc->getArg(1);
3136 // ReduceList: thread local Reduce list.
3137 Argument *ReduceListArg = LtGRFunc->getArg(2);
3138
// Spill the arguments so they can be address-space-cast and reloaded.
3139 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3140 BufferArg->getName() + ".addr");
3141 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3142 IdxArg->getName() + ".addr");
3143 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3144 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3145 ArrayType *RedListArrayTy =
3146 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3147
3148 // 1. Build a list of reduction variables.
3149 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3150 Value *LocalReduceList =
3151 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3152
3154 BufferArgAlloca, Builder.getPtrTy(),
3155 BufferArgAlloca->getName() + ".ascast");
3157 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3158 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3159 ReduceListArgAlloca, Builder.getPtrTy(),
3160 ReduceListArgAlloca->getName() + ".ascast");
3162 LocalReduceList, Builder.getPtrTy(),
3163 LocalReduceList->getName() + ".ascast");
3164
3165 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3166 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3167 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3168
3169 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3170 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3171 Type *IndexTy = Builder.getIndexTy(
// Populate the temporary list with pointers into the global buffer.
3173 for (auto En : enumerate(ReductionInfos)) {
3174 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3175 RedListArrayTy, ReductionList,
3176 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3177 // Global = Buffer.VD[Idx];
3178 Value *BufferVD =
3179 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3181 ReductionsBufferTy, BufferVD, 0, En.index());
3182 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3183 }
3184
3185 // Call reduce_function(ReduceList, GlobalReduceList)
3186 Value *ReduceList =
3187 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3188 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3189 ->addFnAttr(Attribute::NoUnwind);
3191 Builder.restoreIP(OldIP);
3192 return LtGRFunc;
3193}
3194
3195std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3196 std::string Suffix =
3197 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3198 return (Name + Suffix).str();
3199}
3200
// Creates the "<name><platform-suffix>" reduction function: void(ptr, ptr)
// taking LHS and RHS reduce-list arrays. For each reduction variable it
// extracts the element pointers from both arrays and either (a) records them
// for a later Clang-style callback fixup pass, or (b) loads both values,
// invokes RI.ReductionGen, and stores the result back to the LHS slot.
// Returns the created Function (possibly early, if a ReductionGen callback
// leaves the builder without an insert block).
// NOTE(review): embedded original line numbers jump (3208->3210, 3229->3231,
// 3259->3261, 3273->3275, 3295->3297), so the Function::Create line, two
// AddrCast declarations, the `if (ReductionGenCBKind == ...)` conditions
// guarding the two branches, and the final insert-point restore are missing
// from this extracted listing — verify against upstream OMPIRBuilder.cpp.
3201Function *OpenMPIRBuilder::createReductionFunction(
3202 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3203 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3204 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3205 {Builder.getPtrTy(), Builder.getPtrTy()},
3206 /* IsVarArg */ false);
3207 std::string Name = getReductionFuncName(ReducerName);
3208 Function *ReductionFunc =
3210 ReductionFunc->setAttributes(FuncAttrs);
3211 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3212 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3213 BasicBlock *EntryBB =
3214 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3215 Builder.SetInsertPoint(EntryBB);
3216
3217 // Need to alloca memory here and deal with the pointers before getting
3218 // LHS/RHS pointers out
3219 Value *LHSArrayPtr = nullptr;
3220 Value *RHSArrayPtr = nullptr;
3221 Argument *Arg0 = ReductionFunc->getArg(0);
3222 Argument *Arg1 = ReductionFunc->getArg(1);
3223 Type *Arg0Type = Arg0->getType();
3224 Type *Arg1Type = Arg1->getType();
3225
3226 Value *LHSAlloca =
3227 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3228 Value *RHSAlloca =
3229 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3231 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3233 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3234 Builder.CreateStore(Arg0, LHSAddrCast);
3235 Builder.CreateStore(Arg1, RHSAddrCast);
3236 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3237 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3238
3239 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3240 Type *IndexTy = Builder.getIndexTy(
3242 SmallVector<Value *> LHSPtrs, RHSPtrs;
3243 for (auto En : enumerate(ReductionInfos)) {
3244 const ReductionInfo &RI = En.value();
// Element pointers come out of the i-th slot of each ptr-array argument.
3245 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3246 RedArrayTy, RHSArrayPtr,
3247 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3248 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3250 RHSI8Ptr, RI.PrivateVariable->getType(),
3251 RHSI8Ptr->getName() + ".ascast");
3252
3253 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3254 RedArrayTy, LHSArrayPtr,
3255 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3256 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3258 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3259
// First branch (condition line lost in extraction): defer to the Clang
// callback path — just remember the pointers for the fixup loop below.
3261 LHSPtrs.emplace_back(LHSPtr);
3262 RHSPtrs.emplace_back(RHSPtr);
3263 } else {
// MLIR-style path: run the reduction generator inline, LHS op= RHS.
3264 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3265 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3266 Value *Reduced;
3267 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
// A callback may clear the insert block to signal failure; bail out.
3268 if (!Builder.GetInsertBlock())
3269 return ReductionFunc;
3270 Builder.CreateStore(Reduced, LHSPtr);
3271 }
3272 }
3273
3275 for (auto En : enumerate(ReductionInfos)) {
3276 unsigned Index = En.index();
3277 const ReductionInfo &RI = En.value();
3278 Value *LHSFixupPtr, *RHSFixupPtr;
3279 Builder.restoreIP(RI.ReductionGenClang(
3280 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3281
3282 // Fix the CallBack code generated to use the correct Values for the LHS
3283 // and RHS
3284 LHSFixupPtr->replaceUsesWithIf(
3285 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3286 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3287 ReductionFunc;
3288 });
3289 RHSFixupPtr->replaceUsesWithIf(
3290 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3291 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3292 ReductionFunc;
3293 });
3294 }
3295
3297 return ReductionFunc;
3298}
3299
// Debug-build sanity checks over a list of ReductionInfo entries: non-null
// variable, private variable, and at least one reduction generator callback;
// pointer-typed variables; and (host only) matching variable/private types.
// All checks are asserts, so this is a no-op in release builds.
// NOTE(review): the parameter-list line (original line 3301, presumably
// `checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ..., `) was
// lost when this listing was extracted.
3300static void
3302 bool IsGPU) {
3303 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
// Silence unused-variable warnings when asserts compile away (NDEBUG).
3304 (void)RI;
3305 assert(RI.Variable && "expected non-null variable");
3306 assert(RI.PrivateVariable && "expected non-null private variable");
3307 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3308 "expected non-null reduction generator callback");
// On GPU the private copy may live in a different address space, so the
// type-equality check only applies on the host.
3309 if (!IsGPU) {
3310 assert(
3311 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3312 "expected variables and their private equivalents to have the same "
3313 "type");
3314 }
3315 assert(RI.Variable->getType()->isPointerTy() &&
3316 "expected variables to be pointers");
3317 }
3318}
3319
// Entry point for GPU reductions (the signature-opening line, original line
// 3320 — presumably `OpenMPIRBuilder::InsertPointTy
// OpenMPIRBuilder::createReductionsGPU(` — was lost in extraction).
// Emits all helper functions (reduction func, shuffle-and-reduce, inter-warp
// copy, and — for teams reductions — the four list<->global helpers), builds
// the thread-local reduce list, and calls the appropriate
// __kmpc_nvptx_*_reduce_nowait_v2 runtime entry; the runtime's return value
// gates the final aggregation branch. Returns the updated insert point, or
// an invalid InsertPointTy if the location cannot be updated.
// NOTE(review): several interior lines are missing (3383, 3387, 3394, 3417,
// 3419, 3421, 3463, 3477, 3479, 3481, 3500) — verify against upstream
// OMPIRBuilder.cpp before relying on the exact statement forms.
3321 const LocationDescription &Loc, InsertPointTy AllocaIP,
3322 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3323 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3324 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3325 unsigned ReductionBufNum, Value *SrcLocInfo) {
3326 if (!updateToLocation(Loc))
3327 return InsertPointTy();
3328 Builder.restoreIP(CodeGenIP);
3329 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3330 LLVMContext &Ctx = M.getContext();
3331
3332 // Source location for the ident struct
3333 if (!SrcLocInfo) {
3334 uint32_t SrcLocStrSize;
3335 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3336 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3337 }
3338
// Nothing to reduce: leave the IR untouched.
3339 if (ReductionInfos.size() == 0)
3340 return Builder.saveIP();
3341
// Propagate the enclosing function's attributes to the helpers, minus
// OptimizeNone so the helpers remain optimizable.
3342 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3343 AttributeList FuncAttrs;
3344 AttrBuilder AttrBldr(Ctx);
3345 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3346 AttrBldr.addAttribute(Attr);
3347 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3348 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3349
3350 Function *ReductionFunc = nullptr;
3351 CodeGenIP = Builder.saveIP();
3352 ReductionFunc =
3353 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3354 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3355 Builder.restoreIP(CodeGenIP);
3356
3357 // Set the grid value in the config needed for lowering later on
3358 if (GridValue.has_value())
3359 Config.setGridValue(GridValue.value());
3360 else
3361 Config.setGridValue(getGridValue(T, ReductionFunc));
3362
3363 uint32_t SrcLocStrSize;
3364 Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
3365 Value *RTLoc =
3366 getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
3367
3368 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3369 // RedList, shuffle_reduce_func, interwarp_copy_func);
3370 // or
3371 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3372 Value *Res;
3373
3374 // 1. Build a list of reduction variables.
3375 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3376 auto Size = ReductionInfos.size();
3377 Type *PtrTy = PointerType::getUnqual(Ctx);
3378 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
// The array itself must live at the function-level alloca point.
3379 CodeGenIP = Builder.saveIP();
3380 Builder.restoreIP(AllocaIP);
3381 Value *ReductionListAlloca =
3382 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3384 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3385 Builder.restoreIP(CodeGenIP);
3386 Type *IndexTy = Builder.getIndexTy(
3388 for (auto En : enumerate(ReductionInfos)) {
3389 const ReductionInfo &RI = En.value();
3390 Value *ElemPtr = Builder.CreateInBoundsGEP(
3391 RedArrayTy, ReductionList,
3392 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3393 Value *CastElem =
3395 Builder.CreateStore(CastElem, ElemPtr);
3396 }
// Emit the warp-level helpers without disturbing the current position.
3397 CodeGenIP = Builder.saveIP();
3398 Function *SarFunc =
3399 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3400 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3401 Builder.restoreIP(CodeGenIP);
3402
3403 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3404
// The runtime is told maxElementSize * numElements bytes; it also receives
// the element types when a teams buffer struct is needed.
3405 unsigned MaxDataSize = 0;
3406 SmallVector<Type *> ReductionTypeArgs;
3407 for (auto En : enumerate(ReductionInfos)) {
3408 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3409 if (Size > MaxDataSize)
3410 MaxDataSize = Size;
3411 ReductionTypeArgs.emplace_back(En.value().ElementType);
3412 }
3413 Value *ReductionDataSize =
3414 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3415 if (!IsTeamsReduction) {
3416 Value *SarFuncCast =
3418 Value *WcFuncCast =
3420 Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
3422 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3423 Res = Builder.CreateCall(Pv2Ptr, Args);
3424 } else {
// Teams reduction: additionally needs the fixed global buffer and the four
// list<->global copy/reduce helpers.
3425 CodeGenIP = Builder.saveIP();
3426 StructType *ReductionsBufferTy = StructType::create(
3427 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3428 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3429 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3430 Function *LtGCFunc = emitListToGlobalCopyFunction(
3431 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3432 Function *LtGRFunc = emitListToGlobalReduceFunction(
3433 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3434 Function *GtLCFunc = emitGlobalToListCopyFunction(
3435 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3436 Function *GtLRFunc = emitGlobalToListReduceFunction(
3437 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3438 Builder.restoreIP(CodeGenIP);
3439
3440 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3441 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3442
3443 Value *Args3[] = {RTLoc,
3444 KernelTeamsReductionPtr,
3445 Builder.getInt32(ReductionBufNum),
3446 ReductionDataSize,
3447 RL,
3448 SarFunc,
3449 WcFunc,
3450 LtGCFunc,
3451 LtGRFunc,
3452 GtLCFunc,
3453 GtLRFunc};
3454
3455 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3456 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3457 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3458 }
3459
3460 // 5. Build if (res == 1)
3461 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3462 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3464 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3465
3466 // 6. Build then branch: where we have reduced values in the master
3467 // thread in each team.
3468 // __kmpc_end_reduce{_nowait}(<gtid>);
3469 // break;
3470 emitBlock(ThenBB, CurFunc);
3471
3472 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3473 for (auto En : enumerate(ReductionInfos)) {
3474 const ReductionInfo &RI = En.value();
3475 Value *LHS = RI.Variable;
3476 Value *RHS =
3478
// Clang-callback path (the guarding condition line was lost in
// extraction); emits the final aggregation then patches the placeholder
// pointers the callback produced.
3480 Value *LHSPtr, *RHSPtr;
3482 &LHSPtr, &RHSPtr, CurFunc));
3483
3484 // Fix the CallBack code generated to use the correct Values for the LHS
3485 // and RHS
3486 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3487 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3488 ReductionFunc;
3489 });
3490 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3491 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3492 ReductionFunc;
3493 });
3494 } else {
3495 assert(false && "Unhandled ReductionGenCBKind");
3496 }
3497 }
3498 emitBlock(ExitBB, CurFunc);
3499
3501
3502 return Builder.saveIP();
3503}
3504
  // Create a fresh module-level reduction function of type void(ptr, ptr);
  // it is passed to __kmpc_reduce* as the elementwise <reduce_func> callback.
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
  // Two opaque-pointer parameters: the type-erased LHS and RHS value arrays.
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
      ".omp.reduction.func", &M);
}
3513
// Lower a host-side OpenMP reduction: collect pointers to the private copies
// into a type-erased array, hand it to __kmpc_reduce{_nowait}, and emit both
// the non-atomic and atomic finalization paths plus the outlined elementwise
// reduction function the runtime may invoke.
                                  InsertPointTy AllocaIP,
                                  ArrayRef<ReductionInfo> ReductionInfos,
                                  ArrayRef<bool> IsByRef, bool IsNoWait) {
  // One by-ref flag is expected per reduction.
  assert(ReductionInfos.size() == IsByRef.size());
  for (const ReductionInfo &RI : ReductionInfos) {
    (void)RI;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }

  // Nothing to emit if the caller did not provide an insertion point.
  if (!updateToLocation(Loc))
    return InsertPointTy();

  // Everything after the insertion point is moved to "reduce.finalize"; the
  // reduction control flow is emitted in between.
  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  // Store the address of each private copy into its slot of the array.
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // The atomic path is only viable when every reduction provides an atomic
  // generator callback.
  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
    return RI.AtomicReductionGen;
  });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      Builder.CreateCall(ReduceFunc,
                         {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
                          ReductionFunc, Lock},
                         "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  // A return value of 1 selects the non-atomic path, 2 the atomic path, and
  // anything else goes straight to the continuation block.
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // We have one less load for by-ref case because that load is now inside of
    // the reduction region.
    Value *RedValue = nullptr;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
                          "red.private.value." + Twine(En.index()));
    Value *Reduced;
    if (IsByRef[En.index()]) {
                                  PrivateRedValue, Reduced));
    } else {
                                  PrivateRedValue, Reduced));
    }
    // Stop emitting if there is no current insertion point.
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // for by-ref case, the load is inside of the reduction region
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  // The atomic path is only taken when no reduction variable is by-ref.
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
    for (const ReductionInfo &RI : ReductionInfos) {
                                             RI.Variable, RI.PrivateVariable));
      // Stop emitting if there is no current insertion point.
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  // Arg 0/1 are the type-erased LHS/RHS pointer arrays built above.
  Value *LHSArrayPtr = ReductionFunc->getArg(0);
  Value *RHSArrayPtr = ReductionFunc->getArg(1);

  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr =
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
    // Stop emitting if there is no current insertion point.
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // store is inside of the reduction region when using by-ref
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, LHSPtr);
  }

  // Resume user code generation after the reduction.
  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
3686
3689 BodyGenCallbackTy BodyGenCB,
3690 FinalizeCallbackTy FiniCB) {
3691
3692 if (!updateToLocation(Loc))
3693 return Loc.IP;
3694
3695 Directive OMPD = Directive::OMPD_master;
3696 uint32_t SrcLocStrSize;
3697 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3698 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3699 Value *ThreadId = getOrCreateThreadID(Ident);
3700 Value *Args[] = {Ident, ThreadId};
3701
3702 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3703 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3704
3705 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3706 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3707
3708 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3709 /*Conditional*/ true, /*hasFinalize*/ true);
3710}
3711
3714 BodyGenCallbackTy BodyGenCB,
3715 FinalizeCallbackTy FiniCB, Value *Filter) {
3716 if (!updateToLocation(Loc))
3717 return Loc.IP;
3718
3719 Directive OMPD = Directive::OMPD_masked;
3720 uint32_t SrcLocStrSize;
3721 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3722 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3723 Value *ThreadId = getOrCreateThreadID(Ident);
3724 Value *Args[] = {Ident, ThreadId, Filter};
3725 Value *ArgsEnd[] = {Ident, ThreadId};
3726
3727 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3728 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3729
3730 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3731 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3732
3733 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3734 /*Conditional*/ true, /*hasFinalize*/ true);
3735}
3736
// Build the CFG skeleton of a canonical loop — preheader, header, cond, body,
// latch (".inc"), exit, and after blocks — counting an induction variable
// from 0 up to (exclusive) \p TripCount, and record it in a fresh
// CanonicalLoopInfo. \p Name seeds the block and value names.
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  // The induction variable uses the same integer type as the trip count.
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure.
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  // The IV starts at 0 (incoming from the preheader) and is advanced in the
  // latch (second incoming added below).
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);

  // Keep iterating while IV < TripCount (unsigned comparison).
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  // NUW: the IV never wraps because the loop exits once it reaches TripCount.
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);

  // Remember and return the canonical control flow.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
3802
// Create a canonical loop from a precomputed trip count: materialize the
// skeleton next to the insertion block, connect it to the CFG, then let
// \p BodyGenCB populate the loop body.
                                                       LoopBodyGenCallbackTy BodyGenCB,
                                                       Value *TripCount, const Twine &Name) {
  BasicBlock *BB = Loc.IP.getBlock();
  BasicBlock *NextBB = BB->getNextNode();

  // Pre- and post-loop blocks are both inserted before the block following
  // the insertion block.
  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  BodyGenCB(CL->getBodyIP(), CL->getIndVar());

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
3832
3834 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3835 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3836 InsertPointTy ComputeIP, const Twine &Name) {
3837
3838 // Consider the following difficulties (assuming 8-bit signed integers):
3839 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3840 // DO I = 1, 100, 50
3841 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
3842 // DO I = 100, 0, -128
3843
3844 // Start, Stop and Step must be of the same integer type.
3845 auto *IndVarTy = cast<IntegerType>(Start->getType());
3846 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3847 assert(IndVarTy == Step->getType() && "Step type mismatch");
3848
3849 LocationDescription ComputeLoc =
3850 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3851 updateToLocation(ComputeLoc);
3852
3853 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3854 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3855
3856 // Like Step, but always positive.
3857 Value *Incr = Step;
3858
3859 // Distance between Start and Stop; always positive.
3860 Value *Span;
3861
3862 // Condition whether there are no iterations are executed at all, e.g. because
3863 // UB < LB.
3864 Value *ZeroCmp;
3865
3866 if (IsSigned) {
3867 // Ensure that increment is positive. If not, negate and invert LB and UB.
3868 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3869 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3870 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3871 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3872 Span = Builder.CreateSub(UB, LB, "", false, true);
3873 ZeroCmp = Builder.CreateICmp(
3874 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3875 } else {
3876 Span = Builder.CreateSub(Stop, Start, "", true);
3877 ZeroCmp = Builder.CreateICmp(
3878 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3879 }
3880
3881 Value *CountIfLooping;
3882 if (InclusiveStop) {
3883 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3884 } else {
3885 // Avoid incrementing past stop since it could overflow.
3886 Value *CountIfTwo = Builder.CreateAdd(
3887 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3888 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3889 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3890 }
3891 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3892 "omp_" + Name + ".tripcount");
3893
3894 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3895 Builder.restoreIP(CodeGenIP);
3896 Value *Span = Builder.CreateMul(IV, Step);
3897 Value *IndVar = Builder.CreateAdd(Span, Start);
3898 BodyGenCB(Builder.saveIP(), IndVar);
3899 };
3900 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3901 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3902}
3903
3904// Returns an LLVM function to call for initializing loop bounds using OpenMP
3905// static scheduling depending on `type`. Only i32 and i64 are supported by the
3906// runtime. Always interpret integers as unsigned similarly to
3907// CanonicalLoopInfo.
3909 OpenMPIRBuilder &OMPBuilder) {
3910 unsigned Bitwidth = Ty->getIntegerBitWidth();
3911 if (Bitwidth == 32)
3912 return OMPBuilder.getOrCreateRuntimeFunction(
3913 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3914 if (Bitwidth == 64)
3915 return OMPBuilder.getOrCreateRuntimeFunction(
3916 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3917 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3918}
3919
// Lower an existing canonical loop to an OpenMP statically scheduled
// (unchunked) workshare loop: let __kmpc_for_static_init_* pick this
// thread's bounds, shift the loop to iterate over them, and call
// __kmpc_for_static_fini (plus an optional barrier) afterwards.
// Invalidates \p CLI and returns the insert point after the loop.
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for OpenMP runtime.

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());

  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
  // Reload the bounds the runtime wrote back and shrink the loop to cover
  // only this thread's share of iterations.
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.

  // Each use of the 0-based IV is rebased by this thread's lower bound.
  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
                       CLI->getBody()->getFirstInsertionPt());
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
                     CLI->getExit()->getTerminator()->getIterator());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);

  // The loop has been transformed in place; it no longer satisfies the
  // canonical invariants, so retire the CanonicalLoopInfo.
  InsertPointTy AfterIP = CLI->getAfterIP();
  CLI->invalidate();

  return AfterIP;
}
4007
// Lower a canonical loop to a statically chunked workshare loop: an outer
// "dispatch" loop enumerates this thread's chunks (as computed by
// __kmpc_for_static_init_* with a chunked schedule) and the original loop is
// rewired to iterate within the current chunk.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  LLVMContext &Ctx = CLI->getFunction()->getContext();
  Value *IV = CLI->getIndVar();
  Value *OrigTripCount = CLI->getTripCount();
  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  // Runtime bookkeeping is done in i32 or i64, whichever fits the IV type.
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  // Declare useful OpenMP runtime functions.
  FunctionCallee StaticInit =
      getKmpcForStaticInitForType(InternalIVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");

  // Set up the source location value for the OpenMP runtime.

  // TODO: Detect overflow in ubsan or max-out with current tripcount.
  Value *CastedChunkSize =
      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  // A canonical loop iterates from 0 to tripcount with step 1; "init" expects
  // an inclusive upper bound.
  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
  Builder.CreateStore(Zero, PLowerBound);
  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  Builder.CreateStore(OrigUpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
  Builder.CreateCall(StaticInit,
                     {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
                      /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
                      /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
                      /*pstride=*/PStride, /*incr=*/One,
                      /*chunk=*/CastedChunkSize});

  // Load values written by the "init" function.
  Value *FirstChunkStart =
      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
  Value *FirstChunkStop =
      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
  Value *ChunkRange =
      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
  Value *NextChunkStride =
      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");

  // Create outer "dispatch" loop for enumerating the chunks.
  BasicBlock *DispatchEnter = splitBB(Builder, true);
  Value *DispatchCounter;
      {Builder.saveIP(), DL},
      [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
      FirstChunkStart, CastedTripCount, NextChunkStride,
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
      "dispatch");

  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
  // not have to preserve the canonical invariant.
  BasicBlock *DispatchBody = DispatchCLI->getBody();
  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
  BasicBlock *DispatchExit = DispatchCLI->getExit();
  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
  DispatchCLI->invalidate();

  // Rewire the original loop to become the chunk loop inside the dispatch loop.
  redirectTo(DispatchAfter, CLI->getAfter(), DL);
  redirectTo(CLI->getExit(), DispatchLatch, DL);
  redirectTo(DispatchBody, DispatchEnter, DL);

  // Prepare the prolog of the chunk loop.

  // Compute the number of iterations of the chunk loop.
  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
  Value *IsLastChunk =
      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
  // The final chunk may be shorter than the full chunk range.
  Value *CountUntilOrigTripCount =
      Builder.CreateSub(CastedTripCount, DispatchCounter);
  Value *ChunkTripCount = Builder.CreateSelect(
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
  CLI->setTripCount(BackcastedChunkTC);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  Value *BackcastedDispatchCounter =
      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
  CLI->mapIndVar([&](Instruction *) -> Value * {
    Builder.restoreIP(CLI->getBodyIP());
    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
                  /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);

#ifndef NDEBUG
  // Even though we currently do not support applying additional methods to it,
  // the chunk loop should remain a canonical loop.
  CLI->assertOK();
#endif

  return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
}
4150
4151// Returns an LLVM function to call for executing an OpenMP static worksharing
4152// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4153// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4154static FunctionCallee
4156 WorksharingLoopType LoopType) {
4157 unsigned Bitwidth = Ty->getIntegerBitWidth();
4158 Module &M = OMPBuilder->M;
4159 switch (LoopType) {
4160 case WorksharingLoopType::ForStaticLoop:
4161 if (Bitwidth == 32)
4162 return OMPBuilder->getOrCreateRuntimeFunction(
4163 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4164 if (Bitwidth == 64)
4165 return OMPBuilder->getOrCreateRuntimeFunction(
4166 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4167 break;
4168 case WorksharingLoopType::DistributeStaticLoop:
4169 if (Bitwidth == 32)
4170 return OMPBuilder->getOrCreateRuntimeFunction(
4171 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4172 if (Bitwidth == 64)
4173 return OMPBuilder->getOrCreateRuntimeFunction(
4174 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4175 break;
4176 case WorksharingLoopType::DistributeForStaticLoop:
4177 if (Bitwidth == 32)
4178 return OMPBuilder->getOrCreateRuntimeFunction(
4179 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4180 if (Bitwidth == 64)
4181 return OMPBuilder->getOrCreateRuntimeFunction(
4182 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4183 break;
4184 }
4185 if (Bitwidth != 32 && Bitwidth != 64) {
4186 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4187 }
4188 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4189}
4190
4191// Inserts a call to proper OpenMP Device RTL function which handles
4192// loop worksharing.
4194 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4195 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4196 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4197 Type *TripCountTy = TripCount->getType();
4198 Module &M = OMPBuilder->M;
4199 IRBuilder<> &Builder = OMPBuilder->Builder;
4200 FunctionCallee RTLFn =
4201 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4202 SmallVector<Value *, 8> RealArgs;
4203 RealArgs.push_back(Ident);
4204 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4205 RealArgs.push_back(LoopBodyArg);
4206 RealArgs.push_back(TripCount);
4207 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4208 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4209 Builder.CreateCall(RTLFn, RealArgs);
4210 return;
4211 }
4212 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4213 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4214 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4215 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4216
4217 RealArgs.push_back(
4218 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4219 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4220 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4221 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4222 }
4223
4224 Builder.CreateCall(RTLFn, RealArgs);
4225}
4226
// Post-outlining callback for device worksharing loops: after the loop body
// has been outlined into \p OutlinedFn, delete the now-redundant loop CFG and
// replace it with a single call to the device-RTL worksharing entry point
// (emitted via createTargetLoopWorkshareCall) in the preheader.
static void
                             CanonicalLoopInfo *CLI, Value *Ident,
                             Function &OutlinedFn, Type *ParallelTaskPtr,
                             const SmallVector<Instruction *, 4> &ToBeDeleted,
                             WorksharingLoopType LoopType) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  BasicBlock *Preheader = CLI->getPreheader();
  Value *TripCount = CLI->getTripCount();

  // After loop body outlining, the loop body contains only the setup of the
  // loop body argument structure and the call to the outlined loop body
  // function. First, move the setup of the loop body args into the loop
  // preheader (everything except the body's terminator).
  Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
                    CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));

  // The next step is to remove the whole loop. We do not need it anymore.
  // That's why we make an unconditional branch from the loop preheader to the
  // loop exit block.
  Builder.restoreIP({Preheader, Preheader->end()});
  Preheader->getTerminator()->eraseFromParent();
  Builder.CreateBr(CLI->getExit());

  // Delete the dead loop blocks (header through exit).
  OpenMPIRBuilder::OutlineInfo CleanUpInfo;
  SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
  SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
  CleanUpInfo.EntryBB = CLI->getHeader();
  CleanUpInfo.ExitBB = CLI->getExit();
  CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
  DeleteDeadBlocks(BlocksToBeRemoved);

  // Find the instruction which corresponds to the loop body argument structure
  // and remove the call to the loop body function instruction.
  Value *LoopBodyArg;
  User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
  assert(OutlinedFnUser &&
         "Expected unique undroppable user of outlined function");
  CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
  assert(OutlinedFnCallInstruction && "Expected outlined function call");
  assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
         "Expected outlined function call to be located in loop preheader");
  // Check in case no argument structure has been passed (argument 1 carries
  // the structure when present; otherwise pass a null pointer).
  if (OutlinedFnCallInstruction->arg_size() > 1)
    LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
  else
    LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
  OutlinedFnCallInstruction->eraseFromParent();

  // Emit the runtime worksharing call in the preheader, passing the outlined
  // body and its (possibly null) argument structure.
  createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                LoopBodyArg, ParallelTaskPtr, TripCount,
                                OutlinedFn);

  // Erase the instructions marked for deletion and retire the rewritten loop.
  for (auto &ToBeDeletedItem : ToBeDeleted)
    ToBeDeletedItem->eraseFromParent();
  CLI->invalidate();
}
4285
// NOTE(review): this doxygen listing dropped a few fully-hyperlinked lines —
// the return type of this definition (presumably
// `OpenMPIRBuilder::InsertPointTy`) and some local declarations marked below.
// Restore them from the original file before compiling.
//
// Lowers a workshare loop for device (target) codegen: the loop body is
// outlined into a separate function and, once outlining has run, the
// PostOutlineCB replaces the loop with a call into the OpenMP device runtime
// that drives the iteration space (see workshareLoopTargetCallback).
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          WorksharingLoopType LoopType) {
  // Source-location ident_t passed to the device runtime.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  OutlineInfo OI;
  OI.OuterAllocaBB = CLI->getPreheader();
  Function *OuterFn = CLI->getPreheader()->getParent();

  // Instructions which need to be deleted at the end of code generation
  // NOTE(review): dropped declaration here — presumably
  // `SmallVector<Instruction *, 4> ToBeDeleted;` (used below); confirm.

  // NOTE(review): overwrites the preheader assigned above; the AllocaIP block
  // is the one that takes effect.
  OI.OuterAllocaBB = AllocaIP.getBlock();

  // Mark the body loop as region which needs to be extracted
  OI.EntryBB = CLI->getBody();
  OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
                                               "omp.prelatch", true);

  // Prepare loop body for extraction
  Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});

  // Insert new loop counter variable which will be used only in loop
  // body.
  AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
  Instruction *NewLoopCntLoad =
      Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
  // New loop counter instructions are redundant in the loop preheader when
  // code generation for workshare loop is finished. That's why mark them as
  // ready for deletion.
  ToBeDeleted.push_back(NewLoopCntLoad);
  ToBeDeleted.push_back(NewLoopCnt);

  // Analyse loop body region. Find all input variables which are used inside
  // loop body region.
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  // NOTE(review): dropped declaration here — presumably
  // `SmallVector<BasicBlock *, 32> Blocks;` (filled by collectBlocks).
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
  SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
                                        ParallelRegionBlockSet.end());

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks,
                          /* DominatorTree */ nullptr,
                          /* AggregateArgs */ true,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ CLI->getPreheader(),
                          /* Suffix */ ".omp_wsloop",
                          /* AggrArgsIn0AddrSpace */ true);

  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;

  // Find allocas outside the loop body region which are used inside loop
  // body
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  // We need to model loop body region as the function f(cnt, loop_arg).
  // That's why we replace loop induction variable by the new counter
  // which will be one of loop body function argument
  // NOTE(review): dropped line here — presumably the start of
  // `SmallVector<User *> Users(CLI->getIndVar()->user_begin(),`; the
  // continuation below belongs to that declaration.
                               CLI->getIndVar()->user_end());
  // Users is a snapshot, so iteration stays safe while uses are rewritten.
  for (auto Use : Users) {
    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
      if (ParallelRegionBlockSet.count(Inst->getParent())) {
        Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
      }
    }
  }
  // Make sure that loop counter variable is not merged into loop body
  // function argument structure and it is passed as separate variable
  OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);

  // PostOutline CB is invoked when loop body function is outlined and
  // loop body is replaced by call to outlined function. We need to add
  // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
  // function will handle loop control logic.
  //
  OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
    workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
                                ToBeDeletedVec, LoopType);
  };
  addOutlineInfo(std::move(OI));
  return CLI->getAfterIP();
}
4379
4382 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4383 bool HasSimdModifier, bool HasMonotonicModifier,
4384 bool HasNonmonotonicModifier, bool HasOrderedClause,
4385 WorksharingLoopType LoopType) {
4386 if (Config.isTargetDevice())
4387 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4388 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4389 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4390 HasNonmonotonicModifier, HasOrderedClause);
4391
4392 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4393 OMPScheduleType::ModifierOrdered;
4394 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4395 case OMPScheduleType::BaseStatic:
4396 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4397 if (IsOrdered)
4398 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4399 NeedsBarrier, ChunkSize);
4400 // FIXME: Monotonicity ignored?
4401 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4402
4403 case OMPScheduleType::BaseStaticChunked:
4404 if (IsOrdered)
4405 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4406 NeedsBarrier, ChunkSize);
4407 // FIXME: Monotonicity ignored?
4408 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4409 ChunkSize);
4410
4411 case OMPScheduleType::BaseRuntime:
4412 case OMPScheduleType::BaseAuto:
4413 case OMPScheduleType::BaseGreedy:
4414 case OMPScheduleType::BaseBalanced:
4415 case OMPScheduleType::BaseSteal:
4416 case OMPScheduleType::BaseGuidedSimd:
4417 case OMPScheduleType::BaseRuntimeSimd:
4418 assert(!ChunkSize &&
4419 "schedule type does not support user-defined chunk sizes");
4420 [[fallthrough]];
4421 case OMPScheduleType::BaseDynamicChunked:
4422 case OMPScheduleType::BaseGuidedChunked:
4423 case OMPScheduleType::BaseGuidedIterativeChunked:
4424 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4425 case OMPScheduleType::BaseStaticBalancedChunked:
4426 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4427 NeedsBarrier, ChunkSize);
4428
4429 default:
4430 llvm_unreachable("Unknown/unimplemented schedule kind");
4431 }
4432}
4433
4434/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4435/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4436/// the runtime. Always interpret integers as unsigned similarly to
4437/// CanonicalLoopInfo.
4438static FunctionCallee
4440 unsigned Bitwidth = Ty->getIntegerBitWidth();
4441 if (Bitwidth == 32)
4442 return OMPBuilder.getOrCreateRuntimeFunction(
4443 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4444 if (Bitwidth == 64)
4445 return OMPBuilder.getOrCreateRuntimeFunction(
4446 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4447 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4448}
4449
4450/// Returns an LLVM function to call for updating the next loop using OpenMP
4451/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4452/// the runtime. Always interpret integers as unsigned similarly to
4453/// CanonicalLoopInfo.
4454static FunctionCallee
4456 unsigned Bitwidth = Ty->getIntegerBitWidth();
4457 if (Bitwidth == 32)
4458 return OMPBuilder.getOrCreateRuntimeFunction(
4459 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4460 if (Bitwidth == 64)
4461 return OMPBuilder.getOrCreateRuntimeFunction(
4462 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4463 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4464}
4465
4466/// Returns an LLVM function to call for finalizing the dynamic loop using
4467/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4468/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4469static FunctionCallee
4471 unsigned Bitwidth = Ty->getIntegerBitWidth();
4472 if (Bitwidth == 32)
4473 return OMPBuilder.getOrCreateRuntimeFunction(
4474 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4475 if (Bitwidth == 64)
4476 return OMPBuilder.getOrCreateRuntimeFunction(
4477 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4478 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4479}
4480
/// Lower \p CLI as an OpenMP dynamically-scheduled workshare loop: the loop
/// is wrapped in an outer dispatch loop that repeatedly calls
/// __kmpc_dispatch_next to fetch the next chunk's bounds until the runtime
/// reports no more work. Invalidates \p CLI (the loop is no longer canonical
/// afterwards) and returns the insert point after the loop.
///
/// NOTE(review): the doxygen listing this was taken from dropped three
/// hyperlink-only lines; they are marked with NOTE(review) comments below.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  // NOTE(review): dropped line here — presumably the head of an assert such
  // as `assert(isValidWorkshareLoopScheduleType(SchedType) &&`.
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for OpenMP runtime.
  // NOTE(review): dropped line here — likely
  // `Builder.SetCurrentDebugLocation(DL);` — confirm against the source.

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  Builder.CreateCall(DynamicInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      UpperBound, /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // This needs to be 32-bit always, so can't use the IVTy Zero above.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res =
      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
                                       PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  // Bounds were passed to the runtime 1-based (see the stores of One above);
  // subtract 1 to rebase onto the 0-based canonical induction variable.
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * jump to the loop outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  // NOTE(review): dropped line here — presumably the definition of `Comp`,
  // the loop's compare instruction being retargeted below; confirm.
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }

  CLI->invalidate();
  return AfterIP;
}
4607
4608/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4609/// after this \p OldTarget will be orphaned.
4611 BasicBlock *NewTarget, DebugLoc DL) {
4612 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4613 redirectTo(Pred, NewTarget, DL);
4614}
4615
4616/// Determine which blocks in \p BBs are reachable from outside and remove the
4617/// ones that are not reachable from the function.
4619 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4620 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4621 for (Use &U : BB->uses()) {
4622 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4623 if (!UseInst)
4624 continue;
4625 if (BBsToErase.count(UseInst->getParent()))
4626 continue;
4627 return true;
4628 }
4629 return false;
4630 };
4631
4632 while (BBsToErase.remove_if(HasRemainingUses)) {
4633 // Try again if anything was removed.
4634 }
4635
4636 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4637 DeleteDeadBlocks(BBVec);
4638}
4639
// NOTE(review): this listing dropped the leading signature lines — presumably
// `CanonicalLoopInfo *`
// `OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,`
// — plus two interior lines marked below; restore from the original file.
//
// Collapse the given perfect loop nest into a single canonical loop whose trip
// count is the product of the original trip counts. The original induction
// variables are re-derived from the collapsed one via a div/mod scheme, and
// the original loops are invalidated.
                                InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  // NOTE(review): dropped line here — presumably
  // `for (CanonicalLoopInfo *Loop : Loops)` heading the call below.
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  // NOTE(review): dropped line here — likely
  // `Builder.SetCurrentDebugLocation(DL);` — confirm against the source.
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed' loop trip count.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
                                           {}, /*HasNUW=*/true);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop use the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow, from the leading in-between code, the loop nest body, the
  // trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // that the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
4767
// NOTE(review): this listing dropped the second signature line — presumably
// `OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,`
// — plus several interior lines marked below; restore from the original file.
//
// Tile the given perfect loop nest: for every original loop, an outer "floor"
// loop iterating over tiles and an inner "tile" loop iterating within one tile
// are generated. Returns the new loops, floor loops first, then tile loops.
std::vector<CanonicalLoopInfo *>
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  // NOTE(review): dropped line here — presumably
  // `for (CanonicalLoopInfo *Loop : Loops)` heading the call below.
    Loop->collectControlBlocks(OldControlBBs);

  // Collect original trip counts and induction variable to be accessible by
  // index. Also, the structure of the original loops is not preserved during
  // the construction of the tiled loops, so do it before we scavenge the BBs of
  // any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable with in the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into body of the
  // corresponding tile loop.
  // NOTE(review): dropped declaration here — presumably
  // `SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;`.
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  // NOTE(review): dropped line here — likely
  // `Builder.SetCurrentDebugLocation(DL);` — confirm against the source.
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if tripcount divides the tilesize, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup-formula
    //   (tripcount + tilesize - 1)/tilesize
    // because the summation might overflow. We do not want introduce undefined
    // behavior when the untiled loop nest did not.
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    FloorTripCount =
        Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }

  // Generate the new loop nest, from the outermost to the innermost.
  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  // The basic block of the surrounding loop that enters the nest generated
  // loop.
  BasicBlock *Enter = OutermostLoop->getPreheader();

  // The basic block of the surrounding loop where the inner code should
  // continue.
  BasicBlock *Continue = OutermostLoop->getAfter();

  // Where the next loop basic block should be inserted.
  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
          Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
    CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
        DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
    redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
    redirectTo(EmbeddedLoop->getAfter(), Continue, DL);

    // Setup the position where the next embedded loop connects to this loop.
    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;
  };

  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
                                                  const Twine &NameBase) {
    for (auto P : enumerate(TripCounts)) {
      CanonicalLoopInfo *EmbeddedLoop =
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);
    }
  };

  EmbeddNewLoops(FloorCount, "floor");

  // Within the innermost floor loop, emit the code that computes the tile
  // sizes.
  // NOTE(review): dropped line here — presumably
  // `Builder.restoreIP(Result.back()->getBodyIP());` — confirm.
  SmallVector<Value *, 4> TileCounts;
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    Value *TileSize = TileSizes[i];

    // Select the partial-tile remainder as trip count when this floor
    // iteration is the epilogue tile, otherwise a full tile.
    Value *FloorIsEpilogue =
        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
    Value *TileTripCount =
        Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);

    TileCounts.push_back(TileTripCount);
  }

  // Create the tile loops.
  EmbeddNewLoops(TileCounts, "tile");

  // Insert the inbetween code into the body.
  BasicBlock *BodyEnter = Enter;
  BasicBlock *BodyEntered = nullptr;
  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
    BasicBlock *EnterBB = P.first;
    BasicBlock *ExitBB = P.second;

    if (BodyEnter)
      redirectTo(BodyEnter, EnterBB, DL);
    else
      redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);

    BodyEnter = nullptr;
    BodyEntered = ExitBB;
  }

  // Append the original loop nest body into the generated loop nest body.
  if (BodyEnter)
    redirectTo(BodyEnter, InnerEnter, DL);
  else
    redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
  // NOTE(review): dropped line here — presumably
  // `redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);`
  // (InnerLatch is otherwise unused); confirm against the source.

  // Replace the original induction variable with an induction variable computed
  // from the tile and floor induction variables.
  Builder.restoreIP(Result.back()->getBodyIP());
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
    Value *OrigIndVar = OrigIndVars[i];
    Value *Size = TileSizes[i];

    // origIV = tilesize * floorIV + tileIV
    Value *Scale =
        Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
    Value *Shift =
        Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
    OrigIndVar->replaceAllUsesWith(Shift);
  }

  // Remove unused parts of the original loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  for (CanonicalLoopInfo *GenL : Result)
    GenL->assertOK();
#endif
  return Result;
}
4958
4959/// Attach metadata \p Properties to the basic block described by \p BB. If the
4960/// basic block already has metadata, the basic block properties are appended.
4962 ArrayRef<Metadata *> Properties) {
4963 // Nothing to do if no property to attach.
4964 if (Properties.empty())
4965 return;
4966
4967 LLVMContext &Ctx = BB->getContext();
4968 SmallVector<Metadata *> NewProperties;
4969 NewProperties.push_back(nullptr);
4970
4971 // If the basic block already has metadata, prepend it to the new metadata.
4972 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
4973 if (Existing)
4974 append_range(NewProperties, drop_begin(Existing->operands(), 1));
4975
4976 append_range(NewProperties, Properties);
4977 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
4978 BasicBlockID->replaceOperandWith(0, BasicBlockID);
4979
4980 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
4981}
4982
4983/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
4984/// loop already has metadata, the loop properties are appended.
4986 ArrayRef<Metadata *> Properties) {
4987 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
4988
4989 // Attach metadata to the loop's latch
4990 BasicBlock *Latch = Loop->getLatch();
4991 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
4992 addBasicBlockMetadata(Latch, Properties);
4993}
4994
4995/// Attach llvm.access.group metadata to the memref instructions of \p Block
4996static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
4997 LoopInfo &LI) {
4998 for (Instruction &I : *Block) {
4999 if (I.mayReadOrWriteMemory()) {
5000 // TODO: This instruction may already have access group from
5001 // other pragmas e.g. #pragma clang loop vectorize. Append
5002 // so that the existing metadata is not overwritten.
5003 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5004 }
5005 }
5006}
5007
5011 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5012 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5013}
5014
5018 Loop, {
5019 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5020 });
5021}
5022
/// Create an if/else version of \p CanonicalLoop: the "then" side keeps the
/// original loop (the one that may be vectorized) and the "else" side receives
/// a clone, selected at runtime by \p IfCond. \p VMap is filled with the
/// original-to-clone value mapping so callers can post-process the clone.
///
/// NOTE(review): this listing dropped four hyperlink-only lines; they are
/// marked with NOTE(review) comments below — restore from the original file.
void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // Define where if branch should be inserted
  Instruction *SplitBefore;
  if (Instruction::classof(IfCond)) {
    SplitBefore = dyn_cast<Instruction>(IfCond);
  } else {
    SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
  }

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  // NOTE(review): dropped declaration here — presumably
  // `FunctionAnalysisManager FAM;`.
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  // Get the loop which needs to be cloned
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());

  // Create additional blocks for the if statement
  BasicBlock *Head = SplitBefore->getParent();
  Instruction *HeadOldTerm = Head->getTerminator();
  llvm::LLVMContext &C = Head->getContext();
  // NOTE(review): dropped line here — presumably
  // `BasicBlock *ThenBlock = BasicBlock::Create(`.
      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
  // NOTE(review): dropped line here — presumably
  // `BasicBlock *ElseBlock = BasicBlock::Create(`.
      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());

  // Create if condition branch.
  Builder.SetInsertPoint(HeadOldTerm);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // Then block contains branch to omp loop which needs to be vectorized
  spliceBB(IP, ThenBlock, false);
  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone loop for the else branch
  // NOTE(review): dropped declaration here — presumably
  // `SmallVector<BasicBlock *, 8> NewBlocks;`.

  // Remap the cloned preheader onto ElseBlock so clone-internal edges resolve.
  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
  for (BasicBlock *Block : L->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());
}
5083
5084unsigned
5086 const StringMap<bool> &Features) {
5087 if (TargetTriple.isX86()) {
5088 if (Features.lookup("avx512f"))
5089 return 512;
5090 else if (Features.lookup("avx"))
5091 return 256;
5092 return 128;
5093 }
5094 if (TargetTriple.isPPC())
5095 return 128;
5096 if (TargetTriple.isWasm())
5097 return 128;
5098 return 0;
5099}
5100
// Applies OpenMP `simd` semantics to a canonical loop: emits alignment
// assumptions for AlignedVars, versions the loop on IfCond (the clone gets
// vectorization disabled), tags reachable memory accesses with an access
// group (unless a finite safelen forbids it), and attaches
// llvm.loop.vectorize.* metadata.
// NOTE(review): this listing omits original lines 5101 (signature start),
// 5105, 5113, 5123, 5145-5146 and 5186 (hyperlinked declarations such as
// the FunctionAnalysisManager and saved insert point); code kept verbatim.
5102 MapVector<Value *, Value *> AlignedVars,
5103 Value *IfCond, OrderKind Order,
5104 ConstantInt *Simdlen, ConstantInt *Safelen) {
5106
5107 Function *F = CanonicalLoop->getFunction();
5108
5109 // TODO: We should not rely on pass manager. Currently we use pass manager
5110 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5111 // object. We should have a method which returns all blocks between
5112 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5114 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5115 FAM.registerPass([]() { return LoopAnalysis(); });
5116 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5117
5118 LoopAnalysis LIA;
5119 LoopInfo &&LI = LIA.run(*F, FAM);
5120
5121 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
// Emit llvm.assume alignment facts in the preheader, one per aligned var.
5122 if (AlignedVars.size()) {
5124 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
5125 for (auto &AlignedItem : AlignedVars) {
5126 Value *AlignedPtr = AlignedItem.first;
5127 Value *Alignment = AlignedItem.second;
5128 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5129 AlignedPtr, Alignment);
5130 }
5131 Builder.restoreIP(IP);
5132 }
5133
// Version the loop on IfCond; the cloned (else) loop must not be vectorized.
5134 if (IfCond) {
5135 ValueToValueMapTy VMap;
5136 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5137 // Add metadata to the cloned loop which disables vectorization
5138 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5139 assert(MappedLatch &&
5140 "Cannot find value which corresponds to original loop latch");
5141 assert(isa<BasicBlock>(MappedLatch) &&
5142 "Cannot cast mapped latch block value to BasicBlock");
5143 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5144 ConstantAsMetadata *BoolConst =
5147 NewLatchBlock,
5148 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5149 BoolConst})});
5150 }
5151
5152 SmallSet<BasicBlock *, 8> Reachable;
5153
5154 // Get the basic blocks from the loop in which memref instructions
5155 // can be found.
5156 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5157 // preferably without running any passes.
5158 for (BasicBlock *Block : L->getBlocks()) {
5159 if (Block == CanonicalLoop->getCond() ||
5160 Block == CanonicalLoop->getHeader())
5161 continue;
5162 Reachable.insert(Block);
5163 }
5164
5165 SmallVector<Metadata *> LoopMDList;
5166
5167 // In presence of finite 'safelen', it may be unsafe to mark all
5168 // the memory instructions parallel, because loop-carried
5169 // dependences of 'safelen' iterations are possible.
5170 // If clause order(concurrent) is specified then the memory instructions
5171 // are marked parallel even if 'safelen' is finite.
5172 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5173 // Add access group metadata to memory-access instructions.
5174 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5175 for (BasicBlock *BB : Reachable)
5176 addSimdMetadata(BB, AccessGroup, LI);
5177 // TODO: If the loop has existing parallel access metadata, have
5178 // to combine two lists.
5179 LoopMDList.push_back(MDNode::get(
5180 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5181 }
5182
5183 // Use the above access group metadata to create loop level
5184 // metadata, which should be distinct for each loop.
5185 ConstantAsMetadata *BoolConst =
5187 LoopMDList.push_back(MDNode::get(
5188 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5189
5190 if (Simdlen || Safelen) {
5191 // If both simdlen and safelen clauses are specified, the value of the
5192 // simdlen parameter must be less than or equal to the value of the safelen
5193 // parameter. Therefore, use safelen only in the absence of simdlen.
5194 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5195 LoopMDList.push_back(
5196 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5197 ConstantAsMetadata::get(VectorizeWidth)}));
5198 }
5199
5200 addLoopMetadata(CanonicalLoop, LoopMDList);
5201}
5202
5203 /// Create the TargetMachine object to query the backend for optimization
5204 /// preferences.
5205 ///
5206 /// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5207 /// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5208 /// needed for the LLVM pass pipline. We use some default options to avoid
5209 /// having to pass too many settings from the frontend that probably do not
5210 /// matter.
5211 ///
5212 /// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5213 /// method. If we are going to use TargetMachine for more purposes, especially
5214 /// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5215 /// might become be worth requiring front-ends to pass on their TargetMachine,
5216 /// or at least cache it between methods. Note that while fontends such as Clang
5217 /// have just a single main TargetMachine per translation unit, "target-cpu" and
5218 /// "target-features" that determine the TargetMachine are per-function and can
5219 /// be overrided using __attribute__((target("OPTIONS"))).
// NOTE(review): this listing omits original lines 5221 (parameter list),
// 5229 (target registry lookup that defines TheTarget) and 5233 (the
// TargetOptions used below); code kept verbatim. Returns an empty
// unique_ptr when the triple's target is not registered.
5220 static std::unique_ptr<TargetMachine>
5222 Module *M = F->getParent();
5223
// CPU/feature strings are per-function attributes, not module-wide.
5224 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5225 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5226 const std::string &Triple = M->getTargetTriple();
5227
5228 std::string Error;
5230 if (!TheTarget)
5231 return {};
5232
5234 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5235 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5236 /*CodeModel=*/std::nullopt, OptLevel));
5237}
5238
5239 /// Heuristically determine the best-performant unroll factor for \p CLI. This
5240 /// depends on the target processor. We are re-using the same heuristics as the
5241 /// LoopUnrollPass.
// NOTE(review): this listing omits original lines 5242 (signature), 5247,
// 5250, 5264, 5266, 5270, 5272, 5277-5278, 5292-5293, 5297-5298, 5308-5309
// and 5314 (declarations such as FAM, SEA, DTA, ACT, ORE, UP, PP and
// EphValues that the code below uses); code kept verbatim.
5243 Function *F = CLI->getFunction();
5244
5245 // Assume the user requests the most aggressive unrolling, even if the rest of
5246 // the code is optimized using a lower setting.
5248 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5249
// Register the analyses LoopUnrollPass heuristics need, then run them
// manually on this function.
5251 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5252 FAM.registerPass([]() { return AssumptionAnalysis(); });
5253 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5254 FAM.registerPass([]() { return LoopAnalysis(); });
5255 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5256 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5257 TargetIRAnalysis TIRA;
5258 if (TM)
5259 TIRA = TargetIRAnalysis(
5260 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5261 FAM.registerPass([&]() { return TIRA; });
5262
5263 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5265 ScalarEvolution &&SE = SEA.run(*F, FAM);
5267 DominatorTree &&DT = DTA.run(*F, FAM);
5268 LoopAnalysis LIA;
5269 LoopInfo &&LI = LIA.run(*F, FAM);
5271 AssumptionCache &&AC = ACT.run(*F, FAM);
5273
5274 Loop *L = LI.getLoopFor(CLI->getHeader());
5275 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5276
5279 /*BlockFrequencyInfo=*/nullptr,
5280 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5281 /*UserThreshold=*/std::nullopt,
5282 /*UserCount=*/std::nullopt,
5283 /*UserAllowPartial=*/true,
5284 /*UserAllowRuntime=*/true,
5285 /*UserUpperBound=*/std::nullopt,
5286 /*UserFullUnrollMaxCount=*/std::nullopt);
5287
5288 UP.Force = true;
5289
5290 // Account for additional optimizations taking place before the LoopUnrollPass
5291 // would unroll the loop.
5294
5295 // Use normal unroll factors even if the rest of the code is optimized for
5296 // size.
5299
5300 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5301 << " Threshold=" << UP.Threshold << "\n"
5302 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5303 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5304 << " PartialOptSizeThreshold="
5305 << UP.PartialOptSizeThreshold << "\n");
5306
5307 // Disable peeling.
5310 /*UserAllowPeeling=*/false,
5311 /*UserAllowProfileBasedPeeling=*/false,
5312 /*UnrollingSpecficValues=*/false);
5313
5315 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5316
5317 // Assume that reads and writes to stack variables can be eliminated by
5318 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5319 // size.
5320 for (BasicBlock *BB : L->blocks()) {
5321 for (Instruction &I : *BB) {
5322 Value *Ptr;
5323 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5324 Ptr = Load->getPointerOperand();
5325 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5326 Ptr = Store->getPointerOperand();
5327 } else
5328 continue;
5329
5330 Ptr = Ptr->stripPointerCasts();
5331
// Only loads/stores through an entry-block alloca are treated as ephemeral.
5332 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5333 if (Alloca->getParent() == &F->getEntryBlock())
5334 EphValues.insert(&I);
5335 }
5336 }
5337 }
5338
5339 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5340
5341 // Loop is not unrollable if the loop contains certain instructions.
5342 if (!UCE.canUnroll()) {
5343 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5344 return 1;
5345 }
5346
5347 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5348 << "\n");
5349
5350 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5351 // be able to use it.
5352 int TripCount = 0;
5353 int MaxTripCount = 0;
5354 bool MaxOrZero = false;
5355 unsigned TripMultiple = 0;
5356
5357 bool UseUpperBound = false;
5358 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5359 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5360 UseUpperBound);
5361 unsigned Factor = UP.Count;
5362 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5363
5364 // This function returns 1 to signal to not unroll a loop.
5365 if (Factor == 0)
5366 return 1;
5367 return Factor;
5368}
5369
// Partially unrolls \p Loop by \p Factor. Without an UnrolledCLI out-param
// it only attaches llvm.loop.unroll.* metadata; otherwise it tiles the loop
// by the factor (yielding an outer loop returned via *UnrolledCLI) and marks
// the inner loop for full unrolling. Factor == 0 requests a heuristic factor.
// NOTE(review): this listing omits original lines 5370 (signature start),
// 5386, 5398, 5425, 5427 and 5430 (e.g. the FactorConst declarations and
// the heuristic-factor assignment); code kept verbatim.
5371 int32_t Factor,
5372 CanonicalLoopInfo **UnrolledCLI) {
5373 assert(Factor >= 0 && "Unroll factor must not be negative");
5374
5375 Function *F = Loop->getFunction();
5376 LLVMContext &Ctx = F->getContext();
5377
5378 // If the unrolled loop is not used for another loop-associated directive, it
5379 // is sufficient to add metadata for the LoopUnrollPass.
5380 if (!UnrolledCLI) {
5381 SmallVector<Metadata *, 2> LoopMetadata;
5382 LoopMetadata.push_back(
5383 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5384
5385 if (Factor >= 1) {
5387 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5388 LoopMetadata.push_back(MDNode::get(
5389 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5390 }
5391
5392 addLoopMetadata(Loop, LoopMetadata);
5393 return;
5394 }
5395
5396 // Heuristically determine the unroll factor.
5397 if (Factor == 0)
5399
5400 // No change required with unroll factor 1.
5401 if (Factor == 1) {
5402 *UnrolledCLI = Loop;
5403 return;
5404 }
5405
5406 assert(Factor >= 2 &&
5407 "unrolling only makes sense with a factor of 2 or larger");
5408
5409 Type *IndVarTy = Loop->getIndVarType();
5410
5411 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5412 // unroll the inner loop.
5413 Value *FactorVal =
5414 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5415 /*isSigned=*/false));
5416 std::vector<CanonicalLoopInfo *> LoopNest =
5417 tileLoops(DL, {Loop}, {FactorVal});
5418 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5419 *UnrolledCLI = LoopNest[0];
5420 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5421
5422 // LoopUnrollPass can only fully unroll loops with constant trip count.
5423 // Unroll by the unroll factor with a fallback epilog for the remainder
5424 // iterations if necessary.
5426 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5428 InnerLoop,
5429 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5431 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5432
5433#ifndef NDEBUG
5434 (*UnrolledCLI)->assertOK();
5435#endif
5436}
5437
// Emits a call to __kmpc_copyprivate(ident, tid, BufSize, CpyBuf, CpyFn,
// DidIt-load) at the current location; DidIt is loaded as an i32 flag.
// NOTE(review): this listing omits original lines 5438-5439 (the function
// name and first parameters of the signature); code kept verbatim.
5440 llvm::Value *BufSize, llvm::Value *CpyBuf,
5441 llvm::Value *CpyFn, llvm::Value *DidIt) {
5442 if (!updateToLocation(Loc))
5443 return Loc.IP;
5444
5445 uint32_t SrcLocStrSize;
5446 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5447 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5448 Value *ThreadId = getOrCreateThreadID(Ident);
5449
// The runtime takes the flag by value, so load it from the DidIt slot.
5450 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5451
5452 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5453
5454 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5455 Builder.CreateCall(Fn, Args);
5456
5457 return Builder.saveIP();
5458}
5459
// Emits an OpenMP `single` region guarded by __kmpc_single/__kmpc_end_single.
// When copyprivate vars are present, a DidIt flag records which thread ran
// the region and one __kmpc_copyprivate call is emitted per variable
// (which also acts as the barrier); otherwise a barrier is emitted unless
// nowait was requested.
// NOTE(review): this listing omits original lines 5460, 5463 (signature),
// 5472-5473 (DidIt alloca/init), 5496 (the DidIt store in the wrapper),
// 5514 and 5519 (the createCopyPrivate / createBarrier call heads); code
// kept verbatim.
5461 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5462 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5464
5465 if (!updateToLocation(Loc))
5466 return Loc.IP;
5467
5468 // If needed allocate and initialize `DidIt` with 0.
5469 // DidIt: flag variable: 1=single thread; 0=not single thread.
5470 llvm::Value *DidIt = nullptr;
5471 if (!CPVars.empty()) {
5474 }
5475
5476 Directive OMPD = Directive::OMPD_single;
5477 uint32_t SrcLocStrSize;
5478 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5479 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5480 Value *ThreadId = getOrCreateThreadID(Ident);
5481 Value *Args[] = {Ident, ThreadId};
5482
5483 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5484 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5485
5486 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5487 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5488
5489 auto FiniCBWrapper = [&](InsertPointTy IP) {
5490 FiniCB(IP);
5491
5492 // The thread that executes the single region must set `DidIt` to 1.
5493 // This is used by __kmpc_copyprivate, to know if the caller is the
5494 // single thread or not.
5495 if (DidIt)
5497 };
5498
5499 // generates the following:
5500 // if (__kmpc_single()) {
5501 // .... single region ...
5502 // __kmpc_end_single
5503 // }
5504 // __kmpc_copyprivate
5505 // __kmpc_barrier
5506
5507 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5508 /*Conditional*/ true,
5509 /*hasFinalize*/ true);
5510
5511 if (DidIt) {
5512 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5513 // NOTE BufSize is currently unused, so just pass 0.
5515 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5516 CPFuncs[I], DidIt);
5517 // NOTE __kmpc_copyprivate already inserts a barrier
5518 } else if (!IsNowait)
5520 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5521 /* CheckCancelFlag */ false);
5522 return Builder.saveIP();
5523}
5524
// Emits an OpenMP `critical` region: __kmpc_critical (or
// __kmpc_critical_with_hint when a hint is given) on entry and
// __kmpc_end_critical on exit, wrapped via EmitOMPInlinedRegion.
// NOTE(review): this listing omits original line 5525 (the function name /
// first signature line); code kept verbatim.
5526 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5527 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5528
5529 if (!updateToLocation(Loc))
5530 return Loc.IP;
5531
5532 Directive OMPD = Directive::OMPD_critical;
5533 uint32_t SrcLocStrSize;
5534 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5535 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5536 Value *ThreadId = getOrCreateThreadID(Ident);
5537 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5538 Value *Args[] = {Ident, ThreadId, LockVar};
5539
// The entry call takes an extra hint argument; the exit call does not.
5540 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5541 Function *RTFn = nullptr;
5542 if (HintInst) {
5543 // Add Hint to entry Args and create call
5544 EnterArgs.push_back(HintInst);
5545 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5546 } else {
5547 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5548 }
5549 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5550
5551 Function *ExitRTLFn =
5552 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5553 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5554
5555 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5556 /*Conditional*/ false, /*hasFinalize*/ true);
5557}
5558
// Emits an `ordered depend(source|sink)` construct: allocates an i64 depend
// vector of NumLoops entries, fills it with StoreValues, then calls
// __kmpc_doacross_post (source) or __kmpc_doacross_wait (sink) on its base.
// NOTE(review): this listing omits original lines 5559-5560 (the function
// name and first parameter of the signature); code kept verbatim.
5561 InsertPointTy AllocaIP, unsigned NumLoops,
5562 ArrayRef<llvm::Value *> StoreValues,
5563 const Twine &Name, bool IsDependSource) {
5564 assert(
5565 llvm::all_of(StoreValues,
5566 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5567 "OpenMP runtime requires depend vec with i64 type");
5568
5569 if (!updateToLocation(Loc))
5570 return Loc.IP;
5571
5572 // Allocate space for vector and generate alloc instruction.
5573 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5574 Builder.restoreIP(AllocaIP);
5575 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5576 ArgsBase->setAlignment(Align(8));
5577 Builder.restoreIP(Loc.IP);
5578
5579 // Store the index value with offset in depend vector.
5580 for (unsigned I = 0; I < NumLoops; ++I) {
5581 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5582 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5583 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5584 STInst->setAlignment(Align(8));
5585 }
5586
5587 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5588 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5589
5590 uint32_t SrcLocStrSize;
5591 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5592 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5593 Value *ThreadId = getOrCreateThreadID(Ident);
5594 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5595
5596 Function *RTLFn = nullptr;
5597 if (IsDependSource)
5598 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5599 else
5600 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5601 Builder.CreateCall(RTLFn, Args);
5602
5603 return Builder.saveIP();
5604}
5605
// Emits an `ordered` region. For ordered-threads the region is bracketed by
// __kmpc_ordered/__kmpc_end_ordered; for simd ordering no runtime calls are
// emitted (EntryCall/ExitCall stay null) and only the inlined region is built.
// NOTE(review): this listing omits original line 5606 (the function name /
// first signature line); code kept verbatim.
5607 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5608 FinalizeCallbackTy FiniCB, bool IsThreads) {
5609 if (!updateToLocation(Loc))
5610 return Loc.IP;
5611
5612 Directive OMPD = Directive::OMPD_ordered;
5613 Instruction *EntryCall = nullptr;
5614 Instruction *ExitCall = nullptr;
5615
5616 if (IsThreads) {
5617 uint32_t SrcLocStrSize;
5618 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5619 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5620 Value *ThreadId = getOrCreateThreadID(Ident);
5621 Value *Args[] = {Ident, ThreadId};
5622
5623 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5624 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5625
5626 Function *ExitRTLFn =
5627 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5628 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5629 }
5630
5631 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5632 /*Conditional*/ false, /*hasFinalize*/ true);
5633}
5634
// Common machinery for inlined OpenMP regions: pushes a finalization entry,
// splits the current block into entry / finalize / exit blocks, emits the
// (optionally conditional) entry call, runs the body callback, then emits
// finalization and the exit call, and finally re-merges the exit block.
// NOTE(review): this listing omits original lines 5653 and 5668
// (hyperlinked statements between the visible ones); code kept verbatim.
5635 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5636 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5637 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5638 bool HasFinalize, bool IsCancellable) {
5639
5640 if (HasFinalize)
5641 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5642
5643 // Create inlined region's entry and body blocks, in preparation
5644 // for conditional creation
5645 BasicBlock *EntryBB = Builder.GetInsertBlock();
// Guarantee a terminator to split at; a placeholder unreachable is created
// when the entry block is not already terminated by a branch.
5646 Instruction *SplitPos = EntryBB->getTerminator();
5647 if (!isa_and_nonnull<BranchInst>(SplitPos))
5648 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5649 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5650 BasicBlock *FiniBB =
5651 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5652
5654 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5655
5656 // generate body
5657 BodyGenCB(/* AllocaIP */ InsertPointTy(),
5658 /* CodeGenIP */ Builder.saveIP());
5659
5660 // emit exit call and do any needed finalization.
5661 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5662 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5663 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5664 "Unexpected control flow graph state!!");
5665 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5666 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5667 "Unexpected Control Flow State!");
5669
5670 // If we are skipping the region of a non conditional, remove the exit
5671 // block, and clear the builder's insertion point.
5672 assert(SplitPos->getParent() == ExitBB &&
5673 "Unexpected Insertion point location!");
5674 auto merged = MergeBlockIntoPredecessor(ExitBB);
5675 BasicBlock *ExitPredBB = SplitPos->getParent();
5676 auto InsertBB = merged ? ExitPredBB : ExitBB;
// The placeholder terminator (if we created one above) is removed again.
5677 if (!isa_and_nonnull<BranchInst>(SplitPos))
5678 SplitPos->eraseFromParent();
5679 Builder.SetInsertPoint(InsertBB);
5680
5681 return Builder.saveIP();
5682}
5683
// For a conditional directive (e.g. `single`), turns the entry call's result
// into `if (EntryCall != 0) then-body else exit`: creates the then-block,
// replaces the entry block's terminator with a conditional branch, and moves
// the old terminator into the then-block. Returns an insert point in ExitBB.
// NOTE(review): this listing omits original lines 5705 and 5708 (the
// SetInsertPoint calls around moving EntryBBTI and erasing UI); code kept
// verbatim.
5684 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5685 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5686 // if nothing to do, Return current insertion point.
5687 if (!Conditional || !EntryCall)
5688 return Builder.saveIP();
5689
5690 BasicBlock *EntryBB = Builder.GetInsertBlock();
5691 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5692 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
// Temporary terminator so ThenBB is well-formed until the real one is moved in.
5693 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5694
5695 // Emit thenBB and set the Builder's insertion point there for
5696 // body generation next. Place the block after the current block.
5697 Function *CurFn = EntryBB->getParent();
5698 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5699
5700 // Move Entry branch to end of ThenBB, and replace with conditional
5701 // branch (If-stmt)
5702 Instruction *EntryBBTI = EntryBB->getTerminator();
5703 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5704 EntryBBTI->removeFromParent();
5706 Builder.Insert(EntryBBTI);
5707 UI->eraseFromParent();
5709
5710 // return an insertion point to ExitBB.
5711 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5712}
5713
5714OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
5715 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5716 bool HasFinalize) {
5717
5718 Builder.restoreIP(FinIP);
5719
5720 // If there is finalization to do, emit it before the exit call
5721 if (HasFinalize) {
5722 assert(!FinalizationStack.empty() &&
5723 "Unexpected finalization stack state!");
5724
5725 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5726 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5727
5728 Fi.FiniCB(FinIP);
5729
5730 BasicBlock *FiniBB = FinIP.getBlock();
5731 Instruction *FiniBBTI = FiniBB->getTerminator();
5732
5733 // set Builder IP for call creation
5734 Builder.SetInsertPoint(FiniBBTI);
5735 }
5736
5737 if (!ExitCall)
5738 return Builder.saveIP();
5739
5740 // place the Exitcall as last instruction before Finalization block terminator
5741 ExitCall->removeFromParent();
5742 Builder.Insert(ExitCall);
5743
5744 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5745 ExitCall->getIterator());
5746}
5747
// Builds the copyin guard CFG: compares the master and private addresses
// (as integers) and branches into a "copyin.not.master" block only when they
// differ, falling through to "copyin.not.master.end" otherwise. Returns the
// insert point inside the copy block.
// NOTE(review): this listing omits original lines 5748 (signature start),
// 5754 and 5793 (incl. the BranchtoEnd branch target); code kept verbatim.
5749 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5750 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5751 if (!IP.isSet())
5752 return IP;
5753
5755
5756 // creates the following CFG structure
5757 // OMP_Entry : (MasterAddr != PrivateAddr)?
5758 // F T
5759 // | \
5760 // | copin.not.master
5761 // | /
5762 // v /
5763 // copyin.not.master.end
5764 // |
5765 // v
5766 // OMP.Entry.Next
5767
5768 BasicBlock *OMP_Entry = IP.getBlock();
5769 Function *CurFn = OMP_Entry->getParent();
5770 BasicBlock *CopyBegin =
5771 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5772 BasicBlock *CopyEnd = nullptr;
5773
5774 // If entry block is terminated, split to preserve the branch to following
5775 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5776 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5777 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5778 "copyin.not.master.end");
5779 OMP_Entry->getTerminator()->eraseFromParent();
5780 } else {
5781 CopyEnd =
5782 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5783 }
5784
// Pointer identity is compared via ptrtoint in the requested integer width.
5785 Builder.SetInsertPoint(OMP_Entry);
5786 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5787 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5788 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5789 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5790
5791 Builder.SetInsertPoint(CopyBegin);
5792 if (BranchtoEnd)
5794
5795 return Builder.saveIP();
5796}
5797
// Emits a call to __kmpc_alloc(tid, Size, Allocator) and returns the call
// (the allocated pointer), named with \p Name.
// NOTE(review): this listing omits original lines 5798 (signature start)
// and 5801; code kept verbatim.
5799 Value *Size, Value *Allocator,
5800 std::string Name) {
5802 updateToLocation(Loc);
5803
5804 uint32_t SrcLocStrSize;
5805 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5806 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5807 Value *ThreadId = getOrCreateThreadID(Ident);
5808 Value *Args[] = {ThreadId, Size, Allocator};
5809
5810 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
5811
5812 return Builder.CreateCall(Fn, Args, Name);
5814
// Emits a call to __kmpc_free(tid, Addr, Allocator), the counterpart of
// createOMPAlloc above.
// NOTE(review): this listing omits original lines 5815 (signature start)
// and 5818; code kept verbatim.
5816 Value *Addr, Value *Allocator,
5817 std::string Name) {
5819 updateToLocation(Loc);
5820
5821 uint32_t SrcLocStrSize;
5822 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5823 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5824 Value *ThreadId = getOrCreateThreadID(Ident);
5825 Value *Args[] = {ThreadId, Addr, Allocator};
5826 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
5827 return Builder.CreateCall(Fn, Args, Name);
5828}
5829
// Emits a call to __tgt_interop_init. Defaults: Device becomes -1 when not
// given; when NumDependences is null it becomes 0 and DependenceAddress a
// null pointer.
// NOTE(review): this listing omits original lines 5830 (signature start)
// and 5834; code kept verbatim.
5831 const LocationDescription &Loc, Value *InteropVar,
5832 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
5833 Value *DependenceAddress, bool HaveNowaitClause) {
5835 updateToLocation(Loc);
5836
5837 uint32_t SrcLocStrSize;
5838 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5839 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5840 Value *ThreadId = getOrCreateThreadID(Ident);
5841 if (Device == nullptr)
5842 Device = ConstantInt::get(Int32, -1);
5843 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
5844 if (NumDependences == nullptr) {
5845 NumDependences = ConstantInt::get(Int32, 0);
5846 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5847 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5848 }
5849 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5850 Value *Args[] = {
5851 Ident, ThreadId, InteropVar, InteropTypeVal,
5852 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
5853
5854 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
5855
5856 return Builder.CreateCall(Fn, Args);
5857}
5858
// Emits a call to __tgt_interop_destroy with the same null-argument
// defaulting as createOMPInteropInit (Device -> -1, dependences -> 0/null).
// NOTE(review): this listing omits original lines 5859 (signature start)
// and 5862; code kept verbatim.
5860 const LocationDescription &Loc, Value *InteropVar, Value *Device,
5861 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
5863 updateToLocation(Loc);
5864
5865 uint32_t SrcLocStrSize;
5866 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5867 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5868 Value *ThreadId = getOrCreateThreadID(Ident);
5869 if (Device == nullptr)
5870 Device = ConstantInt::get(Int32, -1);
5871 if (NumDependences == nullptr) {
5872 NumDependences = ConstantInt::get(Int32, 0);
5873 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5874 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5875 }
5876 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5877 Value *Args[] = {
5878 Ident, ThreadId, InteropVar, Device,
5879 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5880
5881 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
5882
5883 return Builder.CreateCall(Fn, Args);
5884}
5885
// Emits a call to __tgt_interop_use with the same null-argument defaulting
// as the init/destroy variants above.
// NOTE(review): this listing omits original lines 5886 (signature start)
// and 5891; code kept verbatim.
5887 Value *InteropVar, Value *Device,
5888 Value *NumDependences,
5889 Value *DependenceAddress,
5890 bool HaveNowaitClause) {
5892 updateToLocation(Loc);
5893 uint32_t SrcLocStrSize;
5894 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5895 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5896 Value *ThreadId = getOrCreateThreadID(Ident);
5897 if (Device == nullptr)
5898 Device = ConstantInt::get(Int32, -1);
5899 if (NumDependences == nullptr) {
5900 NumDependences = ConstantInt::get(Int32, 0);
5901 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5902 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5903 }
5904 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5905 Value *Args[] = {
5906 Ident, ThreadId, InteropVar, Device,
5907 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5908
5909 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
5910
5911 return Builder.CreateCall(Fn, Args);
5912}
5913
// Emits a call to __kmpc_threadprivate_cached, passing an internal global
// (named \p Name) as the per-variable cache slot.
// NOTE(review): this listing omits original lines 5914 (signature start)
// and 5916-5917 (remaining parameters / updateToLocation guard head); code
// kept verbatim.
5915 const LocationDescription &Loc, llvm::Value *Pointer,
5918 updateToLocation(Loc);
5919
5920 uint32_t SrcLocStrSize;
5921 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5922 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5923 Value *ThreadId = getOrCreateThreadID(Ident);
5924 Constant *ThreadPrivateCache =
5925 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
5926 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
5927
5928 Function *Fn =
5929 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
5930
5931 return Builder.CreateCall(Fn, Args);
5932}
5933
5936 int32_t MinThreadsVal, int32_t MaxThreadsVal,
5937 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
5938 if (!updateToLocation(Loc))
5939 return Loc.IP;
5940
5941 uint32_t SrcLocStrSize;
5942 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5943 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5944 Constant *IsSPMDVal = ConstantInt::getSigned(
5946 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
5947 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
5948 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
5949
5951
5952 // Manifest the launch configuration in the metadata matching the kernel
5953 // environment.
5954 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
5955 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
5956
5957 // For max values, < 0 means unset, == 0 means set but unknown.
5958 if (MaxThreadsVal < 0)
5959 MaxThreadsVal = std::max(
5960 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
5961
5962 if (MaxThreadsVal > 0)
5963 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
5964
5965 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
5967 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
5968 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
5969 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
5970 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
5971
5972 // We need to strip the debug prefix to get the correct kernel name.
5973 StringRef KernelName = Kernel->getName();
5974 const std::string DebugPrefix = "_debug__";
5975 if (KernelName.ends_with(DebugPrefix))
5976 KernelName = KernelName.drop_back(DebugPrefix.length());
5977
5979 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
5980 const DataLayout &DL = Fn->getDataLayout();
5981
5982 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
5983 Constant *DynamicEnvironmentInitializer =
5984 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
5985 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
5986 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
5987 DynamicEnvironmentInitializer, DynamicEnvironmentName,
5988 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
5989 DL.getDefaultGlobalsAddressSpace());
5990 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
5991
5992 Constant *DynamicEnvironment =
5993 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
5994 ? DynamicEnvironmentGV
5995 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
5996 DynamicEnvironmentPtr);
5997
5998 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
5999 ConfigurationEnvironment, {
6000 UseGenericStateMachineVal,
6001 MayUseNestedParallelismVal,
6002 IsSPMDVal,
6003 MinThreads,
6004 MaxThreads,
6005 MinTeams,
6006 MaxTeams,
6007 ReductionDataSize,
6008 ReductionBufferLength,
6009 });
6010 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6011 KernelEnvironment, {
6012 ConfigurationEnvironmentInitializer,
6013 Ident,
6014 DynamicEnvironment,
6015 });
6016 std::string KernelEnvironmentName =
6017 (KernelName + "_kernel_environment").str();
6018 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6019 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6020 KernelEnvironmentInitializer, KernelEnvironmentName,
6021 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6022 DL.getDefaultGlobalsAddressSpace());
6023 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6024
6025 Constant *KernelEnvironment =
6026 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6027 ? KernelEnvironmentGV
6028 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6029 KernelEnvironmentPtr);
6030 Value *KernelLaunchEnvironment = Kernel->getArg(0);
6031 CallInst *ThreadKind =
6032 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6033
6034 Value *ExecUserCode = Builder.CreateICmpEQ(
6035 ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
6036 "exec_user_code");
6037
6038 // ThreadKind = __kmpc_target_init(...)
6039 // if (ThreadKind == -1)
6040 // user_code
6041 // else
6042 // return;
6043
6044 auto *UI = Builder.CreateUnreachable();
6045 BasicBlock *CheckBB = UI->getParent();
6046 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6047
6048 BasicBlock *WorkerExitBB = BasicBlock::Create(
6049 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6050 Builder.SetInsertPoint(WorkerExitBB);
6052
6053 auto *CheckBBTI = CheckBB->getTerminator();
6054 Builder.SetInsertPoint(CheckBBTI);
6055 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6056
6057 CheckBBTI->eraseFromParent();
6058 UI->eraseFromParent();
6059
6060 // Continue in the "user_code" block, see diagram above and in
6061 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6062 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6063}
6064
// OpenMPIRBuilder::createTargetDeinit — the leading signature line is
// truncated in this dump. Emits the __kmpc_target_deinit runtime call at the
// current location and, if a teams reduction is in use, patches the kernel
// environment global (created by createTargetInit) with the reduction data
// size and buffer length.
6066 int32_t TeamsReductionDataSize,
6067 int32_t TeamsReductionBufferLength) {
     // Bail out without emitting anything when the location is invalid.
6068 if (!updateToLocation(Loc))
6069 return;
6070
     // NOTE(review): the callee on the next line is truncated in this dump —
     // presumably Fn = getOrCreateRuntimeFunctionPtr(...); confirm upstream.
6072 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6073
6074 Builder.CreateCall(Fn, {});
6075
     // Only patch the environment when both reduction quantities are nonzero.
6076 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6077 return;
6078
6080 // We need to strip the debug prefix to get the correct kernel name.
6081 StringRef KernelName = Kernel->getName();
6082 const std::string DebugPrefix = "_debug__";
6083 if (KernelName.ends_with(DebugPrefix))
6084 KernelName = KernelName.drop_back(DebugPrefix.length());
     // The "<kernel>_kernel_environment" global is created by createTargetInit,
     // so it must already exist at this point.
6085 auto *KernelEnvironmentGV =
6086 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6087 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6088 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
     // Fold the two values into the constant initializer: indices {0,7} and
     // {0,8} are ReductionDataSize / ReductionBufferLength inside the nested
     // ConfigurationEnvironment struct (see the initializer built in
     // createTargetInit).
6089 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6090 KernelEnvironmentInitializer,
6091 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6092 NewInitializer = ConstantFoldInsertValueInstruction(
6093 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6094 {0, 8});
6095 KernelEnvironmentGV->setInitializer(NewInitializer);
6096}
6097
// getNVPTXMDNode — the signature line is truncated in this dump (takes the
// kernel Function and a property Name). Scans the module-level
// "nvvm.annotations" named metadata for a 3-operand entry of the form
// {kernel, name, value} matching this kernel and property, returning the node
// or nullptr when no such annotation exists.
6099 Module &M = *Kernel.getParent();
6100 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6101 for (auto *Op : MD->operands()) {
     // Only (function, property-name, value) triples are of interest.
6102 if (Op->getNumOperands() != 3)
6103 continue;
     // Operand 0 must be this kernel function.
6104 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6105 if (!KernelOp || KernelOp->getValue() != &Kernel)
6106 continue;
     // Operand 1 must be the requested property name.
6107 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6108 if (!Prop || Prop->getString() != Name)
6109 continue;
6110 return Op;
6111 }
6112 return nullptr;
6113}
6114
// updateNVPTXMetadata — signature truncated in this dump (kernel, property
// name, new integer Value, and a Min flag). Updates or inserts an
// "nvvm.annotations" entry; when an entry already exists the old and new
// limits are combined with min (Min=true) or max (Min=false).
6116 bool Min) {
6117 // Update the "maxntidx" metadata for NVIDIA, or add it.
6118 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6119 if (ExistingOp) {
     // Existing annotation: replace its value operand (index 2), keeping the
     // type of the old constant and combining limits per the Min flag.
6120 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6121 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6122 ExistingOp->replaceOperandWith(
6123 2, ConstantAsMetadata::get(ConstantInt::get(
6124 OldVal->getValue()->getType(),
6125 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6126 } else {
6127 LLVMContext &Ctx = Kernel.getContext();
     // NOTE(review): the first element(s) of the MDVals initializer are
     // truncated in this dump — presumably ConstantAsMetadata::get(&Kernel)
     // as the leading operand; confirm upstream.
6129 MDString::get(Ctx, Name),
6131 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6132 // Append metadata to nvvm.annotations
6133 Module &M = *Kernel.getParent();
6134 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6135 MD->addOperand(MDNode::get(Ctx, MDVals));
6136 }
6137}
6138
// OpenMPIRBuilder::readThreadBoundsForKernel — the line carrying the method
// name is truncated in this dump. Returns the {min, max} thread bounds
// recorded for a target kernel: from the "amdgpu-flat-work-group-size" string
// attribute on AMDGPU, or from the "maxntidx" nvvm annotation otherwise, each
// clamped by the "omp_target_thread_limit" attribute when that is nonzero.
6139std::pair<int32_t, int32_t>
6141 int32_t ThreadLimit =
6142 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6143
6144 if (T.isAMDGPU()) {
     // The attribute has the form "<LB>,<UB>"; fall back progressively when
     // either half fails to parse.
6145 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6146 if (!Attr.isValid() || !Attr.isStringAttribute())
6147 return {0, ThreadLimit};
6148 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6149 int32_t LB, UB;
6150 if (!llvm::to_integer(UBStr, UB, 10))
6151 return {0, ThreadLimit};
     // ThreadLimit == 0 means "unset" and imposes no clamp.
6152 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6153 if (!llvm::to_integer(LBStr, LB, 10))
6154 return {0, UB};
6155 return {LB, UB};
6156 }
6157
     // Non-AMDGPU path: consult the NVPTX "maxntidx" annotation if present.
6158 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6159 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6160 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6161 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6162 }
6163 return {0, ThreadLimit};
6164}
6165
// OpenMPIRBuilder::writeThreadBoundsForKernel — leading signature line
// truncated in this dump. Records thread bounds on a kernel: always sets the
// "omp_target_thread_limit" attribute, plus the target-specific encoding
// ("amdgpu-flat-work-group-size" string attribute on AMDGPU, or the
// "maxntidx" nvvm annotation otherwise, merged via min with any prior value).
6167 Function &Kernel, int32_t LB,
6168 int32_t UB) {
6169 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6170
6171 if (T.isAMDGPU()) {
6172 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6173 llvm::utostr(LB) + "," + llvm::utostr(UB));
6174 return;
6175 }
6176
     // /*Min=*/true: an existing, smaller maxntidx wins over UB.
6177 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6178}
6179
// OpenMPIRBuilder::readTeamBoundsForKernel — the line carrying the method
// name is truncated in this dump. Returns {0, N} where N comes from the
// "omp_target_num_teams" function attribute; backend-specific annotations are
// not yet consulted.
6180std::pair<int32_t, int32_t>
6182 // TODO: Read from backend annotations if available.
6183 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6184}
6185
// OpenMPIRBuilder::writeTeamsForKernel — leading signature line truncated in
// this dump. Records team bounds on a kernel: "maxclusterrank" nvvm
// annotation on NVPTX (only when UB is known), "amdgpu-max-num-workgroups"
// on AMDGPU, and the generic "omp_target_num_teams" attribute in all cases.
6187 int32_t LB, int32_t UB) {
6188 if (T.isNVPTX())
6189 if (UB > 0)
     // /*Min=*/true: keep the smaller of an existing limit and UB.
6190 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6191 if (T.isAMDGPU())
     // AMDGPU encodes only the X dimension; Y and Z are fixed at 1.
6192 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6193
6194 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6195}
6196
// Applies device-side attributes to an outlined target region function.
// NOTE(review): three statements inside the isTargetDevice() branch are
// truncated in this dump (between 6199/6201, 6202/6204, and 6204/6206) —
// presumably linkage/visibility/calling-convention updates; confirm upstream.
6197void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6198 Function *OutlinedFn) {
6199 if (Config.isTargetDevice()) {
6201 // TODO: Determine if DSO local can be set to true.
6202 OutlinedFn->setDSOLocal(false);
6204 if (T.isAMDGCN())
6206 }
6207}
6208
// Returns the "function ID" used to identify an outlined target region.
// On the device the outlined function itself serves as the ID; on the host a
// fresh weak, constant i8 global named EntryFnIDName acts as a unique handle.
6209Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6210 StringRef EntryFnIDName) {
6211 if (Config.isTargetDevice()) {
6212 assert(OutlinedFn && "The outlined function must exist if embedded");
6213 return OutlinedFn;
6214 }
6215
     // Host side: the global's address (not its i8 content) is the identity.
6216 return new GlobalVariable(
6217 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6218 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6219}
6220
// Returns the entry address recorded for a target region: the outlined
// function itself when it exists, otherwise a fresh internal, constant i8
// global carrying the kernel's entry name (address used as a placeholder).
6221Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6222 StringRef EntryFnName) {
6223 if (OutlinedFn)
6224 return OutlinedFn;
6225
     // The name must be unused; the entry name doubles as the global's name.
6226 assert(!M.getGlobalVariable(EntryFnName, true) &&
6227 "Named kernel already exists?");
6228 return new GlobalVariable(
6229 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6230 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6231}
6232
// OpenMPIRBuilder::emitTargetRegionFunction — leading signature line
// truncated in this dump. Computes the unique entry-point name for the
// region, invokes the generation callback to produce the outlined function,
// and (for offload entries) registers it and produces its function ID.
6234 TargetRegionEntryInfo &EntryInfo,
6235 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6236 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6237
6238 SmallString<64> EntryFnName;
6239 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6240
     // NOTE(review): the assignment head on the next line is truncated in this
     // dump — presumably "OutlinedFn = <condition>"; confirm upstream.
6242 ? GenerateFunctionCallback(EntryFnName)
6243 : nullptr;
6244
6245 // If this target outline function is not an offload entry, we don't need to
6246 // register it. This may be in the case of a false if clause, or if there are
6247 // no OpenMP targets.
6248 if (!IsOffloadEntry)
6249 return;
6250
     // NOTE(review): the condition of this ternary is truncated in this dump.
6251 std::string EntryFnIDName =
6253 ? std::string(EntryFnName)
6254 : createPlatformSpecificName({EntryFnName, "region_id"});
6255
6256 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6257 EntryFnName, EntryFnIDName);
6258}
6259
// OpenMPIRBuilder::registerTargetRegionFunction — leading signature line
// truncated in this dump. Applies device attributes to the outlined function
// (when present), materializes its ID and entry address, records the entry
// with the offload info manager, and returns the ID.
6261 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6262 StringRef EntryFnName, StringRef EntryFnIDName) {
6263 if (OutlinedFn)
6264 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6265 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6266 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
     // NOTE(review): the call head preceding this argument list is truncated
     // in this dump — presumably
     // OffloadInfoManager.registerTargetRegionEntryInfo(...); confirm upstream.
6268 EntryInfo, EntryAddr, OutlinedFnID,
6270 return OutlinedFnID;
6271}
6272
// OpenMPIRBuilder::createTargetData — leading signature line truncated in
// this dump. Emits the begin/end runtime calls for an OpenMP target-data
// region. With a BodyGenCB, the user body is emitted between
// __tgt_target_data_begin_mapper and __tgt_target_data_end_mapper; without
// one, a single standalone mapper call (MapperFunc) is emitted. An IfCond
// routes generation through emitIfClause so the runtime calls are skipped
// when the clause evaluates to false.
6274 const LocationDescription &Loc, InsertPointTy AllocaIP,
6275 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6276 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6277 omp::RuntimeFunction *MapperFunc,
6278 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
6279 BodyGenCB,
6280 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6281 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6282 if (!updateToLocation(Loc))
6283 return InsertPointTy();
6284
6285 // Disable TargetData CodeGen on Device pass.
6286 if (Config.IsTargetDevice.value_or(false)) {
6287 if (BodyGenCB)
     // NOTE(review): the statement between these two lines is truncated in
     // this dump — presumably the body callback invocation; confirm upstream.
6289 return Builder.saveIP();
6290 }
6291
6292 Builder.restoreIP(CodeGenIP);
     // Standalone mode (enter/exit/update) has no body callback.
6293 bool IsStandAlone = !BodyGenCB;
6294 MapInfosTy *MapInfo;
6295 // Generate the code for the opening of the data environment. Capture all the
6296 // arguments of the runtime call by reference because they are used in the
6297 // closing of the region.
6298 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6299 MapInfo = &GenMapInfoCB(Builder.saveIP());
6300 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6301 /*IsNonContiguous=*/true, DeviceAddrCB,
6302 CustomMapperCB);
6303
6304 TargetDataRTArgs RTArgs;
     // NOTE(review): the call head preceding this argument is truncated in
     // this dump — presumably emitOffloadingArraysArgument(...).
6306 !MapInfo->Names.empty());
6307
6308 // Emit the number of elements in the offloading arrays.
6309 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6310
6311 // Source location for the ident struct
6312 if (!SrcLocInfo) {
6313 uint32_t SrcLocStrSize;
6314 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6315 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6316 }
6317
6318 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6319 PointerNum, RTArgs.BasePointersArray,
6320 RTArgs.PointersArray, RTArgs.SizesArray,
6321 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6322 RTArgs.MappersArray};
6323
6324 if (IsStandAlone) {
6325 assert(MapperFunc && "MapperFunc missing for standalone target data");
     // NOTE(review): the call head preceding this argument is truncated in
     // this dump — presumably Builder.CreateCall of the MapperFunc pointer.
6327 OffloadingArgs);
6328 } else {
6329 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6330 omp::OMPRTL___tgt_target_data_begin_mapper);
6331
6332 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6333
     // For use_device_ptr-style entries backed by an alloca, load the mapped
     // pointer and store it into the privatized slot.
6334 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6335 if (isa<AllocaInst>(DeviceMap.second.second)) {
6336 auto *LI =
6337 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6338 Builder.CreateStore(LI, DeviceMap.second.second);
6339 }
6340 }
6341
6342 // If device pointer privatization is required, emit the body of the
6343 // region here. It will have to be duplicated: with and without
6344 // privatization.
     // NOTE(review): the privatized body emission statement is truncated in
     // this dump; confirm upstream.
6346 }
6347 };
6348
6349 // If we need device pointer privatization, we need to emit the body of the
6350 // region with no privatization in the 'else' branch of the conditional.
6351 // Otherwise, we don't have to do anything.
6352 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     // NOTE(review): this lambda's body statement is truncated in this dump.
6354 };
6355
6356 // Generate code for the closing of the data region.
6357 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6358 TargetDataRTArgs RTArgs;
6359 emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
6360 /*ForEndCall=*/true);
6361
6362 // Emit the number of elements in the offloading arrays.
6363 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6364
6365 // Source location for the ident struct
6366 if (!SrcLocInfo) {
6367 uint32_t SrcLocStrSize;
6368 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6369 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6370 }
6371
6372 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6373 PointerNum, RTArgs.BasePointersArray,
6374 RTArgs.PointersArray, RTArgs.SizesArray,
6375 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6376 RTArgs.MappersArray};
6377 Function *EndMapperFunc =
6378 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6379
6380 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6381 };
6382
6383 // We don't have to do anything to close the region if the if clause evaluates
6384 // to false.
6385 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
6386
     // Region form (begin + body + end) vs standalone form (single call).
6387 if (BodyGenCB) {
6388 if (IfCond) {
6389 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6390 } else {
6391 BeginThenGen(AllocaIP, Builder.saveIP());
6392 }
6393
6394 // If we don't require privatization of device pointers, we emit the body in
6395 // between the runtime calls. This avoids duplicating the body code.
     // NOTE(review): the non-privatized body emission statement is truncated
     // in this dump; confirm upstream.
6397
6398 if (IfCond) {
6399 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6400 } else {
6401 EndThenGen(AllocaIP, Builder.saveIP());
6402 }
6403 } else {
6404 if (IfCond) {
6405 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6406 } else {
6407 BeginThenGen(AllocaIP, Builder.saveIP());
6408 }
6409 }
6410
6411 return Builder.saveIP();
6412}
6413
// createForStaticInitFunction — leading signature line truncated in this dump
// (takes IVSize, IVSigned, IsGPUDistribute). Selects among the eight
// __kmpc_{distribute_,}for_static_init_{4,4u,8,8u} runtime entry points based
// on induction-variable width/signedness and whether this is a GPU distribute.
6416 bool IsGPUDistribute) {
6417 assert((IVSize == 32 || IVSize == 64) &&
6418 "IV size is not compatible with the omp runtime");
     // NOTE(review): the declaration of 'Name' is truncated in this dump.
6420 if (IsGPUDistribute)
6421 Name = IVSize == 32
6422 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6423 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6424 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6425 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6426 else
6427 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6428 : omp::OMPRTL___kmpc_for_static_init_4u)
6429 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6430 : omp::OMPRTL___kmpc_for_static_init_8u);
6431
     // NOTE(review): the return statement is truncated in this dump —
     // presumably returning the runtime function for 'Name'; confirm upstream.
6433}
6434
// createDispatchInitFunction — leading signature line truncated in this dump.
// Selects __kmpc_dispatch_init_{4,4u,8,8u} based on induction-variable
// width and signedness (dynamic/guided scheduling initialization).
6436 bool IVSigned) {
6437 assert((IVSize == 32 || IVSize == 64) &&
6438 "IV size is not compatible with the omp runtime");
6439 RuntimeFunction Name = IVSize == 32
6440 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6441 : omp::OMPRTL___kmpc_dispatch_init_4u)
6442 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6443 : omp::OMPRTL___kmpc_dispatch_init_8u);
6444
     // NOTE(review): the return statement is truncated in this dump.
6446}
6447
// createDispatchNextFunction — leading signature line truncated in this dump.
// Selects __kmpc_dispatch_next_{4,4u,8,8u} based on induction-variable
// width and signedness (fetches the next dynamically scheduled chunk).
6449 bool IVSigned) {
6450 assert((IVSize == 32 || IVSize == 64) &&
6451 "IV size is not compatible with the omp runtime");
6452 RuntimeFunction Name = IVSize == 32
6453 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6454 : omp::OMPRTL___kmpc_dispatch_next_4u)
6455 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6456 : omp::OMPRTL___kmpc_dispatch_next_8u);
6457
     // NOTE(review): the return statement is truncated in this dump.
6459}
6460
// createDispatchFiniFunction — leading signature line truncated in this dump.
// Selects __kmpc_dispatch_fini_{4,4u,8,8u} based on induction-variable
// width and signedness (finalizes a dynamically scheduled loop).
6462 bool IVSigned) {
6463 assert((IVSize == 32 || IVSize == 64) &&
6464 "IV size is not compatible with the omp runtime");
6465 RuntimeFunction Name = IVSize == 32
6466 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6467 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6468 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6469 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6470
     // NOTE(review): the return statement is truncated in this dump.
6472}
6473
// createOutlinedFunction (static helper) — the leading "static Function *"
// signature line and the callback parameters are truncated in this dump.
// Builds the outlined target region function: computes the parameter list
// (device kernels take an implicit leading dyn_ptr and pass everything else
// as pointer or i64), emits target init/deinit around the user body on the
// device, and rewrites every use of the captured Inputs inside the new
// function to the corresponding argument-derived copies.
6475 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6479 SmallVector<Type *> ParameterTypes;
6480 if (OMPBuilder.Config.isTargetDevice()) {
6481 // Add the "implicit" runtime argument we use to provide launch specific
6482 // information for target devices.
6483 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6484 ParameterTypes.push_back(Int8PtrTy);
6485
6486 // All parameters to target devices are passed as pointers
6487 // or i64. This assumes 64-bit address spaces/pointers.
6488 for (auto &Arg : Inputs)
6489 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6490 ? Arg->getType()
6491 : Type::getInt64Ty(Builder.getContext()));
6492 } else {
6493 for (auto &Arg : Inputs)
6494 ParameterTypes.push_back(Arg->getType());
6495 }
6496
6497 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6498 /*isVarArg*/ false);
6499 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
6500 Builder.GetInsertBlock()->getModule());
6501
6502 // Save insert point.
6503 auto OldInsertPoint = Builder.saveIP();
6504
6505 // Generate the region into the function.
6506 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6507 Builder.SetInsertPoint(EntryBB);
6508
6509 // Insert target init call in the device compilation pass.
6510 if (OMPBuilder.Config.isTargetDevice())
6511 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6512
6513 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6514
6515 // As we embed the user code in the middle of our target region after we
6516 // generate entry code, we must move what allocas we can into the entry
6517 // block to avoid possible breaking optimisations for device
6518 if (OMPBuilder.Config.isTargetDevice())
     // NOTE(review): the guarded statement is truncated in this dump;
     // confirm upstream.
6520
6521 // Insert target deinit call in the device compilation pass.
6522 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
6523 if (OMPBuilder.Config.isTargetDevice())
6524 OMPBuilder.createTargetDeinit(Builder);
6525
6526 // Insert return instruction.
6527 Builder.CreateRetVoid();
6528
6529 // New Alloca IP at entry point of created device function.
6530 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6531 auto AllocaIP = Builder.saveIP();
6532
6533 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6534
6535 // Skip the artificial dyn_ptr on the device.
6536 const auto &ArgRange =
6537 OMPBuilder.Config.isTargetDevice()
6538 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6539 : Func->args();
6540
6541 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6542 // Things like GEP's can come in the form of Constants. Constants and
6543 // ConstantExpr's do not have access to the knowledge of what they're
6544 // contained in, so we must dig a little to find an instruction so we
6545 // can tell if they're used inside of the function we're outlining. We
6546 // also replace the original constant expression with a new instruction
6547 // equivalent; an instruction as it allows easy modification in the
6548 // following loop, as we can now know the constant (instruction) is
6549 // owned by our target function and replaceUsesOfWith can now be invoked
6550 // on it (cannot do this with constants it seems). A brand new one also
6551 // allows us to be cautious as it is perhaps possible the old expression
6552 // was used inside of the function but exists and is used externally
6553 // (unlikely by the nature of a Constant, but still).
6554 // NOTE: We cannot remove dead constants that have been rewritten to
6555 // instructions at this stage, we run the risk of breaking later lowering
6556 // by doing so as we could still be in the process of lowering the module
6557 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6558 // constants we have created rewritten versions of.
6559 if (auto *Const = dyn_cast<Constant>(Input))
6560 convertUsersOfConstantsToInstructions(Const, Func, false);
6561
6562 // Collect all the instructions
6563 for (User *User : make_early_inc_range(Input->users()))
6564 if (auto *Instr = dyn_cast<Instruction>(User))
6565 if (Instr->getFunction() == Func)
6566 Instr->replaceUsesOfWith(Input, InputCopy);
6567 };
6568
6569 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6570
6571 // Rewrite uses of input valus to parameters.
6572 for (auto InArg : zip(Inputs, ArgRange)) {
6573 Value *Input = std::get<0>(InArg);
6574 Argument &Arg = std::get<1>(InArg);
6575 Value *InputCopy = nullptr;
6576
     // The accessor callback materializes InputCopy from the argument.
6577 Builder.restoreIP(
6578 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
6579
6580 // In certain cases a Global may be set up for replacement, however, this
6581 // Global may be used in multiple arguments to the kernel, just segmented
6582 // apart, for example, if we have a global array, that is sectioned into
6583 // multiple mappings (technically not legal in OpenMP, but there is a case
6584 // in Fortran for Common Blocks where this is neccesary), we will end up
6585 // with GEP's into this array inside the kernel, that refer to the Global
6586 // but are technically seperate arguments to the kernel for all intents and
6587 // purposes. If we have mapped a segment that requires a GEP into the 0-th
6588 // index, it will fold into an referal to the Global, if we then encounter
6589 // this folded GEP during replacement all of the references to the
6590 // Global in the kernel will be replaced with the argument we have generated
6591 // that corresponds to it, including any other GEP's that refer to the
6592 // Global that may be other arguments. This will invalidate all of the other
6593 // preceding mapped arguments that refer to the same global that may be
6594 // seperate segments. To prevent this, we defer global processing until all
6595 // other processing has been performed.
6596 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6597 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6598 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6599 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6600 continue;
6601 }
6602
6603 ReplaceValue(Input, InputCopy, Func);
6604 }
6605
6606 // Replace all of our deferred Input values, currently just Globals.
6607 for (auto Deferred : DeferredReplacement)
6608 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6609
6610 // Restore insert point.
6611 Builder.restoreIP(OldInsertPoint);
6612
6613 return Func;
6614}
6615
// emitTargetOutlinedFunction (static helper) — the leading "static void"
// signature line and trailing callback parameters are truncated in this dump.
// Wraps createOutlinedFunction in a FunctionGenCallback and hands it to
// emitTargetRegionFunction, which fills in OutlinedFn and OutlinedFnID.
6617 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6618 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
6619 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
6622
     // The callback defers actual function creation until the unique entry
     // name has been computed by emitTargetRegionFunction.
6623 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
6624 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
6625 &ArgAccessorFuncCB](StringRef EntryFnName) {
6626 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
6627 CBFunc, ArgAccessorFuncCB);
6628 };
6629
     // /*IsOffloadEntry=*/true: always register this region as an offload entry.
6630 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
6631 OutlinedFn, OutlinedFnID);
6632}
6633
// emitTargetCall (static helper) — several parameter lines are truncated in
// this dump. Emits the host-side kernel launch for a target region: builds
// the offloading argument arrays from the map info, assembles
// TargetKernelArgs, and calls emitKernelLaunch with a fallback that simply
// invokes the outlined function on the host.
6634static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6636 Function *OutlinedFn, Constant *OutlinedFnID,
6637 int32_t NumTeams, int32_t NumThreads,
6640
     // NOTE(review): the TargetDataInfo constructor head is truncated in this
     // dump; these are its trailing flag arguments.
6642 /*RequiresDevicePointerInfo=*/false,
6643 /*SeparateBeginEndCalls=*/true);
6644
6645 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
6646 OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
6647 /*IsNonContiguous=*/true);
6648
     // NOTE(review): the RTArgs declaration is truncated in this dump.
6650 OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
6651 !MapInfo.Names.empty());
6652
6653 // emitKernelLaunch
     // Fallback path: if the offload fails at runtime, run the host version.
6654 auto &&EmitTargetCallFallbackCB =
6656 Builder.restoreIP(IP);
6657 Builder.CreateCall(OutlinedFn, Args);
6658 return Builder.saveIP();
6659 };
6660
6661 unsigned NumTargetItems = MapInfo.BasePointers.size();
6662 // TODO: Use correct device ID
6663 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
6664 Value *NumTeamsVal = Builder.getInt32(NumTeams);
6665 Value *NumThreadsVal = Builder.getInt32(NumThreads);
6666 uint32_t SrcLocStrSize;
6667 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
6668 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
6669 llvm::omp::IdentFlag(0), 0);
6670 // TODO: Use correct NumIterations
6671 Value *NumIterations = Builder.getInt64(0);
6672 // TODO: Use correct DynCGGroupMem
6673 Value *DynCGGroupMem = Builder.getInt32(0);
6674
6675 bool HasNoWait = false;
6676
6677 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
6678 NumTeamsVal, NumThreadsVal,
6679 DynCGGroupMem, HasNoWait);
6680
6681 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
6682 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
6683 DeviceID, RTLoc, AllocaIP));
6684}
6685
// OpenMPIRBuilder::createTarget — the leading signature line and two trailing
// callback parameters are truncated in this dump. Entry point for an OpenMP
// target construct: outlines the region (both host and device compiles) and,
// on the host only, emits the runtime launch of the resulting kernel.
6687 const LocationDescription &Loc, InsertPointTy AllocaIP,
6688 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
6689 int32_t NumThreads, SmallVectorImpl<Value *> &Args,
6690 GenMapInfoCallbackTy GenMapInfoCB,
6693 if (!updateToLocation(Loc))
6694 return InsertPointTy();
6695
6696 Builder.restoreIP(CodeGenIP);
6697
6698 Function *OutlinedFn;
6699 Constant *OutlinedFnID;
6700 emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
6701 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
     // Device compilation emits only the kernel; the launch call is host-only.
6702 if (!Config.isTargetDevice())
6703 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
6704 NumThreads, Args, GenMapInfoCB);
6705
6706 return Builder.saveIP();
6707}
6708
// Joins name Parts using FirstSeparator before the first part and Separator
// before each subsequent one (e.g. {".", "."} yields ".a.b").
6709std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
6710 StringRef FirstSeparator,
6711 StringRef Separator) {
6712 SmallString<128> Buffer;
     // NOTE(review): the stream declaration over 'Buffer' is truncated in this
     // dump — presumably a raw_svector_ostream named OS; confirm upstream.
6714 StringRef Sep = FirstSeparator;
6715 for (StringRef Part : Parts) {
6716 OS << Sep << Part;
6717 Sep = Separator;
6718 }
6719 return OS.str().str();
6720}
6721
// OpenMPIRBuilder::createPlatformSpecificName — the line carrying the method
// name is truncated in this dump. Joins Parts using the configuration's
// platform-specific first separator and separator strings.
6722std::string
6724 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
6725 Config.separator());
6726}
6727
// OpenMPIRBuilder::getOrCreateInternalVariable — the signature lines are
// truncated in this dump (type, name, address space). Returns a cached
// module-level global of the given type and name, creating a zero-initialized
// one (aligned to max of type/pointer ABI alignment) on first use.
6730 unsigned AddressSpace) {
     // One cache entry per name; a null second means "not yet created".
6731 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
6732 if (Elem.second) {
6733 assert(Elem.second->getValueType() == Ty &&
6734 "OMP internal variable has different type than requested");
6735 } else {
6736 // TODO: investigate the appropriate linkage type used for the global
6737 // variable for possibly changing that to internal or private, or maybe
6738 // create different versions of the function for different OMP internal
6739 // variables.
     // NOTE(review): the two arms of this ternary are truncated in this dump
     // (the wasm32 case picks a different linkage); confirm upstream.
6740 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
6743 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
6744 Constant::getNullValue(Ty), Elem.first(),
6745 /*InsertBefore=*/nullptr,
     // NOTE(review): the trailing constructor arguments (thread-local mode and
     // address space) are truncated in this dump.
6747 const DataLayout &DL = M.getDataLayout();
6748 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
6749 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
     // Use the stricter of the type's and the pointer's ABI alignment.
6750 GV->setAlignment(std::max(TypeAlign, PtrAlign));
6751 Elem.second = GV;
6752 }
6753
6754 return Elem.second;
6755}
6756
// Returns (creating on first use) the internal lock variable backing a named
// OpenMP critical section; the name follows the
// "gomp_critical_user_<name>.var" convention.
6757Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
6758 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
6759 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
6760 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
6761}
6762
// OpenMPIRBuilder::getSizeInBytes — the signature and a local declaration
// (presumably the LLVMContext 'Ctx') are truncated in this dump. Computes the
// pointee size in bytes with the classic "GEP index 1 from a null pointer,
// then ptrtoint" sizeof idiom, yielding an i64 Value.
6765 Value *Null =
6766 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
     // &((T*)nullptr)[1] == sizeof(T); converted to an integer below.
6767 Value *SizeGep =
6768 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
6769 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
6770 return SizePtrToInt;
6771}
6772
// OpenMPIRBuilder::createOffloadMaptypes — the signature lines are truncated
// in this dump (takes the map-type flag values and the variable name).
// Materializes the map-type flags as a private, constant, unnamed_addr
// global array used as the map-types argument of offloading runtime calls.
6775 std::string VarName) {
6776 llvm::Constant *MaptypesArrayInit =
     // NOTE(review): the initializer expression is truncated in this dump —
     // presumably a ConstantDataArray built from the mappings; confirm
     // upstream.
6778 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
6779 M, MaptypesArrayInit->getType(),
6780 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
6781 VarName);
     // unnamed_addr: only the contents matter, enabling merging of duplicates.
6782 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
6783 return MaptypesArrayGlobal;
6784}
6785
// OpenMPIRBuilder::createMapperAllocas — the leading signature line is
// truncated in this dump. At AllocaIP, creates the three stack arrays
// (.offload_baseptrs, .offload_ptrs as i8* arrays; .offload_sizes as i64
// array) of NumOperands elements used by mapper runtime calls, then restores
// the caller's insert point and returns the allocas via MapperAllocas.
6787 InsertPointTy AllocaIP,
6788 unsigned NumOperands,
6789 struct MapperAllocas &MapperAllocas) {
6790 if (!updateToLocation(Loc))
6791 return;
6792
6793 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
6794 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
     // Allocas must live in the function's alloca block, not at Loc.IP.
6795 Builder.restoreIP(AllocaIP);
6796 AllocaInst *ArgsBase = Builder.CreateAlloca(
6797 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
6798 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
6799 ".offload_ptrs");
6800 AllocaInst *ArgSizes = Builder.CreateAlloca(
6801 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
     // Return the builder to where the caller was emitting code.
6802 Builder.restoreIP(Loc.IP);
6803 MapperAllocas.ArgsBase = ArgsBase;
6804 MapperAllocas.Args = Args;
6805 MapperAllocas.ArgSizes = ArgSizes;
6806}
6807
// OpenMPIRBuilder::emitMapperCall — the leading signature lines (including
// the MapperAllocas parameter) are truncated in this dump. Emits a call to a
// mapper runtime function with GEPs to the first elements of the previously
// allocated offload arrays plus the device ID, operand count, map types/names
// and a null mappers array.
6809 Function *MapperFunc, Value *SrcLocInfo,
6810 Value *MaptypesArg, Value *MapnamesArg,
6812 int64_t DeviceID, unsigned NumOperands) {
6813 if (!updateToLocation(Loc))
6814 return;
6815
6816 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
6817 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
     // NOTE(review): the CreateInBoundsGEP call heads for the three GEPs below
     // are truncated in this dump (each indexes element {0,0} of the
     // corresponding mapper alloca); confirm upstream.
6818 Value *ArgsBaseGEP =
6820 {Builder.getInt32(0), Builder.getInt32(0)});
6821 Value *ArgsGEP =
6823 {Builder.getInt32(0), Builder.getInt32(0)});
6824 Value *ArgSizesGEP =
6826 {Builder.getInt32(0), Builder.getInt32(0)});
     // No user-defined mapper array is passed here.
6827 Value *NullPtr =
6828 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
6829 Builder.CreateCall(MapperFunc,
6830 {SrcLocInfo, Builder.getInt64(DeviceID),
6831 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
6832 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
6833}
6834
// OpenMPIRBuilder::emitOffloadingArraysArgument — the leading signature line
// is truncated in this dump. Populates RTArgs with the pointers actually
// passed to the offloading runtime: null pointers when there is nothing to
// map, otherwise GEPs to element 0 of each array recorded in Info.RTArgs.
// For region-end calls, a dedicated end-of-region map-types array is used
// when present (entries can differ between begin and end).
6836 TargetDataRTArgs &RTArgs,
6837 TargetDataInfo &Info,
6838 bool EmitDebug,
6839 bool ForEndCall) {
6840 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
6841 "expected region end call to runtime only when end call is separate");
     // Opaque pointers: all of these alias the same unqualified pointer type.
6842 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
6843 auto VoidPtrTy = UnqualPtrTy;
6844 auto VoidPtrPtrTy = UnqualPtrTy;
6845 auto Int64Ty = Type::getInt64Ty(M.getContext());
6846 auto Int64PtrTy = UnqualPtrTy;
6847
     // Nothing mapped: the runtime accepts all-null argument arrays.
6848 if (!Info.NumberOfPtrs) {
6849 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
6850 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
6851 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
6852 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
6853 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
6854 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
6855 return;
6856 }
6857
     // NOTE(review): the assignment/call heads for the four GEPs below are
     // truncated in this dump (each assigns a {0,0} GEP into the matching
     // RTArgs field); confirm upstream.
6859 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
6860 Info.RTArgs.BasePointersArray,
6861 /*Idx0=*/0, /*Idx1=*/0);
6863 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
6864 /*Idx0=*/0,
6865 /*Idx1=*/0);
6867 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
6868 /*Idx0=*/0, /*Idx1=*/0);
6870 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
6871 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
6872 : Info.RTArgs.MapTypesArray,
6873 /*Idx0=*/0,
6874 /*Idx1=*/0);
6875
6876 // Only emit the mapper information arrays if debug information is
6877 // requested.
6878 if (!EmitDebug)
6879 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
6880 else
6882 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
6883 /*Idx0=*/0,
6884 /*Idx1=*/0);
6885 // If there is no user-defined mapper, set the mapper array to nullptr to
6886 // avoid an unnecessary data privatization
6887 if (!Info.HasMapper)
6888 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
6889 else
6890 RTArgs.MappersArray =
6891 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
6892}
6893
// OpenMPIRBuilder::emitNonContiguousDescriptor (the signature line, doc line
// 6894, was lost in extraction): for each non-contiguous mapped entity, build
// a stack array of `struct descriptor_dim {offset, count, stride}` (one entry
// per dimension) and store its address into the offload pointers array so the
// runtime can perform strided transfers.
6895 InsertPointTy CodeGenIP,
6896 MapInfosTy &CombinedInfo,
6897 TargetDataInfo &Info) {
// NOTE(review): the declaration that binds this expression (doc line 6898,
// presumably `MapInfosTy::StructNonContiguousInfo &NonContigInfo =`) was
// elided by the extraction.
6899 CombinedInfo.NonContigInfo;
6900 
6901 // Build an array of struct descriptor_dim and then assign it to
6902 // offload_args.
6903 //
6904 // struct descriptor_dim {
6905 // uint64_t offset;
6906 // uint64_t count;
6907 // uint64_t stride
6908 // };
6909 Type *Int64Ty = Builder.getInt64Ty();
// NOTE(review): the call prefix creating `DimTy` (doc line 6910, a
// StructType::create of three i64 fields) was elided here.
6911 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
6912 "struct.descriptor_dim");
6913 
// Field indices into struct.descriptor_dim, matching the layout above.
6914 enum { OffsetFD = 0, CountFD, StrideFD };
6915 // We need two index variable here since the size of "Dims" is the same as
6916 // the size of Components, however, the size of offset, count, and stride is
6917 // equal to the size of base declaration that is non-contiguous.
6918 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
6919 // Skip emitting ir if dimension size is 1 since it cannot be
6920 // non-contiguous.
6921 if (NonContigInfo.Dims[I] == 1)
6922 continue;
// The alloca is placed at AllocaIP (function entry); the fills below
// happen at the code-gen position.
6923 Builder.restoreIP(AllocaIP);
6924 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
6925 AllocaInst *DimsAddr =
6926 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
6927 Builder.restoreIP(CodeGenIP);
// Dimensions are written innermost-first: entry II takes data for RevIdx.
6928 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
6929 unsigned RevIdx = EE - II - 1;
6930 Value *DimsLVal = Builder.CreateInBoundsGEP(
6931 DimsAddr->getAllocatedType(), DimsAddr,
6932 {Builder.getInt64(0), Builder.getInt64(II)});
6933 // Offset
6934 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
// NOTE(review): the store-call prefixes (doc lines 6935/6940/6945,
// presumably Builder.CreateAlignedStore) were elided; only the
// argument lists remain below.
6936 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
6937 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
6938 // Count
6939 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
6941 NonContigInfo.Counts[L][RevIdx], CountLVal,
6942 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
6943 // Stride
6944 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
// NOTE(review): the alignment below queries CountLVal's type, not
// StrideLVal's. Harmless if both are pointers of equal preferred
// alignment, but it reads like a copy-paste slip — confirm upstream.
6946 NonContigInfo.Strides[L][RevIdx], StrideLVal,
6947 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
6948 }
6949 // args[I] = &dims
6950 Builder.restoreIP(CodeGenIP);
// NOTE(review): the pointer-cast of DimsAddr and the GEP/store into
// Info.RTArgs.PointersArray (doc lines 6951-6957) are partially elided.
6952 DimsAddr, Builder.getPtrTy());
6954 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
6955 Info.RTArgs.PointersArray, 0, I);
// L indexes only the non-contiguous entries (see comment above the loop).
6958 ++L;
6959 }
6960}
6961
// OpenMPIRBuilder::emitOffloadingArrays (signature line, doc line 6962, lost
// in extraction): materialize the .offload_baseptrs / .offload_ptrs /
// .offload_sizes / .offload_mappers arrays from CombinedInfo and record them
// in Info.RTArgs. Sizes that are compile-time constants go into a private
// global; runtime-evaluated sizes are stored into a stack array. Optional
// callbacks report device pointers (DeviceAddrCB) and supply user-defined
// mappers (CustomMapperCB).
6963 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
6964 TargetDataInfo &Info, bool IsNonContiguous,
6965 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6966 function_ref<Value *(unsigned int)> CustomMapperCB) {
6967 
6968 // Reset the array information.
6969 Info.clearArrayInfo();
6970 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
6971 
// Nothing to emit when there are no mapped pointers.
6972 if (Info.NumberOfPtrs == 0)
6973 return;
6974 
6975 Builder.restoreIP(AllocaIP);
6976 // Detect if we have any capture size requiring runtime evaluation of the
6977 // size so that a constant array could be eventually used.
6978 ArrayType *PointerArrayType =
6979 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
6980 
6981 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
6982 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
6983 
6984 Info.RTArgs.PointersArray = Builder.CreateAlloca(
6985 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
6986 AllocaInst *MappersArray = Builder.CreateAlloca(
6987 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
6988 Info.RTArgs.MappersArray = MappersArray;
6989 
6990 // If we don't have any VLA types or other types that require runtime
6991 // evaluation, we can use a constant array for the map sizes, otherwise we
6992 // need to fill up the arrays as we do for the pointers.
6993 Type *Int64Ty = Builder.getInt64Ty();
6994 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
6995 ConstantInt::get(Int64Ty, 0));
// RuntimeSizes[I] is set when Sizes[I] must be computed at run time.
6996 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
6997 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
6998 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
6999 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
// For non-contiguous entries the "size" slot carries the number of
// dimensions instead of a byte count.
7000 if (IsNonContiguous &&
7001 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7002 CombinedInfo.Types[I] &
7003 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
7004 ConstSizes[I] =
7005 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
7006 else
7007 ConstSizes[I] = CI;
7008 continue;
7009 }
7010 }
7011 RuntimeSizes.set(I);
7012 }
7013 
// All sizes runtime-evaluated: a plain stack array, filled per-entry below.
7014 if (RuntimeSizes.all()) {
7015 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7016 Info.RTArgs.SizesArray = Builder.CreateAlloca(
7017 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7018 Builder.restoreIP(CodeGenIP);
7019 } else {
// At least some sizes are constant: emit a private constant global.
7020 auto *SizesArrayInit = ConstantArray::get(
7021 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
7022 std::string Name = createPlatformSpecificName({"offload_sizes"});
7023 auto *SizesArrayGbl =
7024 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
7025 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
7026 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
7027 
7028 if (!RuntimeSizes.any()) {
// Fully constant: use the global directly.
7029 Info.RTArgs.SizesArray = SizesArrayGbl;
7030 } else {
// Mixed: copy the constant global into a stack buffer, then overwrite
// the runtime entries below.
7031 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7032 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
7033 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
// NOTE(review): the alloca that defines `Buffer` (doc lines 7033-7034)
// and the CreateMemCpy prefix (doc line 7038) were elided by the
// extraction; only trailing argument lists remain.
7035 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7036 Buffer->setAlignment(OffloadSizeAlign);
7037 Builder.restoreIP(CodeGenIP);
7039 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
7040 SizesArrayGbl, OffloadSizeAlign,
7042 IndexSize,
7043 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
7044 
7045 Info.RTArgs.SizesArray = Buffer;
7046 }
7047 Builder.restoreIP(CodeGenIP);
7048 }
7049 
7050 // The map types are always constant so we don't need to generate code to
7051 // fill arrays. Instead, we create an array constant.
// NOTE(review): the declaration of `Mapping` (doc line 7052, presumably
// `SmallVector<uint64_t, 4> Mapping;`) was elided.
7053 for (auto mapFlag : CombinedInfo.Types)
7054 Mapping.push_back(
7055 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7056 mapFlag));
7057 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
7058 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7059 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
7060 
7061 // The information types are only built if provided.
7062 if (!CombinedInfo.Names.empty()) {
7063 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
7064 auto *MapNamesArrayGbl =
7065 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
7066 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
7067 } else {
// NOTE(review): the null-constant expression (doc line 7069) was elided.
7068 Info.RTArgs.MapNamesArray =
7070 }
7071 
7072 // If there's a present map type modifier, it must not be applied to the end
7073 // of a region, so generate a separate map type array in that case.
7074 if (Info.separateBeginEndCalls()) {
7075 bool EndMapTypesDiffer = false;
7076 for (uint64_t &Type : Mapping) {
7077 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7078 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
7079 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7080 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
7081 EndMapTypesDiffer = true;
7082 }
7083 }
7084 if (EndMapTypesDiffer) {
7085 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7086 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
7087 }
7088 }
7089 
// Per-entry fill of the base-pointer, pointer, runtime-size and mapper slots.
7090 PointerType *PtrTy = Builder.getPtrTy();
7091 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
7092 Value *BPVal = CombinedInfo.BasePointers[I];
// NOTE(review): the GEP prefix defining `BP` (doc line 7093) and the
// alignment argument of the store (doc line 7097) were elided.
7094 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
7095 0, I);
7096 Builder.CreateAlignedStore(BPVal, BP,
7098 
7099 if (Info.requiresDevicePointerInfo()) {
7100 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
// use_device_ptr: allocate a private slot to hold the translated
// device pointer; report it via the callback.
7101 CodeGenIP = Builder.saveIP();
7102 Builder.restoreIP(AllocaIP);
7103 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
7104 Builder.restoreIP(CodeGenIP);
7105 if (DeviceAddrCB)
7106 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
7107 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
// use_device_addr: the base-pointer slot itself is reported.
7108 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
7109 if (DeviceAddrCB)
7110 DeviceAddrCB(I, BP);
7111 }
7112 }
7113 
7114 Value *PVal = CombinedInfo.Pointers[I];
// NOTE(review): the GEP prefix defining `P` (doc line 7115) and the store
// of PVal (doc lines 7119-7120) were elided.
7116 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
7117 I);
7118 // TODO: Check alignment correct.
7121 
// Runtime-evaluated sizes overwrite their slot in the sizes array.
7122 if (RuntimeSizes.test(I)) {
// NOTE(review): the GEP defining `S` (doc line 7123) and the store/sext
// prefix (doc line 7127) were elided.
7124 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7125 /*Idx0=*/0,
7126 /*Idx1=*/I);
7128 Int64Ty,
7129 /*isSigned=*/true),
7130 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
7131 }
7132 // Fill up the mapper array.
7133 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7134 Value *MFunc = ConstantPointerNull::get(PtrTy);
7135 if (CustomMapperCB)
7136 if (Value *CustomMFunc = CustomMapperCB(I))
7137 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
// NOTE(review): the GEP prefix defining `MAddr` (doc line 7138) and the
// aligned-store prefix (doc line 7141) were elided.
7139 MappersArray->getAllocatedType(), MappersArray,
7140 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
7142 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
7143 }
7144 
// Emit descriptor_dim arrays only when non-contiguous data is present.
7145 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
7146 Info.NumberOfPtrs == 0)
7147 return;
7148 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
7149}
7150
// OpenMPIRBuilder::emitBranch (signature, doc lines 7151-7152, lost in
// extraction): fall through to the target block with an unconditional branch,
// unless the current insertion block is absent or already terminated.
// NOTE(review): doc line 7153 below originally bound `CurBB` (presumably from
// Builder.GetInsertBlock()); the branch creation (7159) and the trailing
// insertion-point reset (7162) were also elided.
7153
7154 if (!CurBB || CurBB->getTerminator()) {
7155 // If there is no insert point or the previous block is already
7156 // terminated, don't touch it.
7157 } else {
7158 // Otherwise, create a fall-through branch.
7160 }
7161 
7163}
7164
// OpenMPIRBuilder::emitBlock (signature start, doc line 7165, lost in
// extraction): close out the current block with a fall-through branch into
// BB, then either erase BB (when finished and unused) or insert it into the
// function right after the current block (or at the end).
7166 bool IsFinished) {
// NOTE(review): doc line 7167 (presumably the binding of `CurBB`) was elided.
7168
7169 // Fall out of the current block (if necessary).
7170 emitBranch(BB);
7171 
// A finished block nobody branches to is dead — drop it instead of
// inserting an unreachable block.
7172 if (IsFinished && BB->use_empty()) {
7173 BB->eraseFromParent();
7174 return;
7175 }
7176 
7177 // Place the block after the current block, if possible, or else at
7178 // the end of the function.
7179 if (CurBB && CurBB->getParent())
7180 CurFn->insert(std::next(CurBB->getIterator()), BB);
7181 else
7182 CurFn->insert(CurFn->end(), BB);
// NOTE(review): doc line 7183 (presumably Builder.SetInsertPoint(BB)) was
// elided.
7184}
7185
// OpenMPIRBuilder::emitIfClause (signature start, doc line 7186, lost in
// extraction): emit `if (Cond) ThenGen else ElseGen`. When Cond is a
// ConstantInt, emit only the live arm; otherwise build the three-block
// diamond omp_if.then / omp_if.else / omp_if.end.
7187 BodyGenCallbackTy ElseGen,
7188 InsertPointTy AllocaIP) {
7189 // If the condition constant folds and can be elided, try to avoid emitting
7190 // the condition and the dead arm of the if/else.
7191 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
7192 auto CondConstant = CI->getSExtValue();
7193 if (CondConstant)
7194 ThenGen(AllocaIP, Builder.saveIP());
7195 else
7196 ElseGen(AllocaIP, Builder.saveIP());
7197 return;
7198 }
7199 
// NOTE(review): doc line 7200 (presumably binding `CurFn` from the current
// insert block) was elided by the extraction.
7201
7202 // Otherwise, the condition did not fold, or we couldn't elide it. Just
7203 // emit the conditional branch.
7204 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
7205 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
7206 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
7207 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
7208 // Emit the 'then' code.
7209 emitBlock(ThenBlock, CurFn);
7210 ThenGen(AllocaIP, Builder.saveIP());
7211 emitBranch(ContBlock);
7212 // Emit the 'else' code if present.
7213 // There is no need to emit line number for unconditional branch.
7214 emitBlock(ElseBlock, CurFn);
7215 ElseGen(AllocaIP, Builder.saveIP());
7216 // There is no need to emit line number for unconditional branch.
7217 emitBranch(ContBlock);
7218 // Emit the continuation block for code after the if.
7219 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
7220}
7221
// Decide whether an implicit flush is required after an atomic construct of
// kind AK with ordering AO, and emit it. Returns true iff a flush was
// emitted. The flush-ordering resolution (FlushAO) is computed but not yet
// forwarded to the runtime (see TODO below).
7222bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
7223 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
// NOTE(review): the assert condition (doc lines 7224-7225) and the FlushAO
// declaration (doc line 7229) were elided by the extraction, as were the
// ordering conditions guarding each case (7233-7234, 7242-7243, 7250,
// 7254, 7258-7260).
7226 "Unexpected Atomic Ordering.");
7227 
7228 bool Flush = false;
7230
7231 switch (AK) {
7232 case Read:
// Read with an acquire-class ordering needs an acquire flush.
7235 FlushAO = AtomicOrdering::Acquire;
7236 Flush = true;
7237 }
7238 break;
7239 case Write:
7240 case Compare:
7241 case Update:
// Write/compare/update with a release-class ordering needs a release flush.
7244 FlushAO = AtomicOrdering::Release;
7245 Flush = true;
7246 }
7247 break;
7248 case Capture:
// Capture distinguishes per-ordering: acquire, release, or both.
7249 switch (AO) {
7251 FlushAO = AtomicOrdering::Acquire;
7252 Flush = true;
7253 break;
7255 FlushAO = AtomicOrdering::Release;
7256 Flush = true;
7257 break;
7261 Flush = true;
7262 break;
7263 default:
7264 // do nothing - leave silently.
7265 break;
7266 }
7267 }
7268 
7269 if (Flush) {
7270 // Currently Flush RT call still doesn't take memory_ordering, so for when
7271 // that happens, this tries to do the resolution of which atomic ordering
7272 // to use with but issue the flush call
7273 // TODO: pass `FlushAO` after memory ordering support is added
7274 (void)FlushAO;
7275 emitFlush(Loc);
7276 }
7277 
7278 // for AO == AtomicOrdering::Monotonic and all other case combinations
7279 // do nothing
7280 return Flush;
7281}
7282
// OpenMPIRBuilder::createAtomicRead (signature start, doc lines 7283-7285,
// lost in extraction): emit `v = x` as an atomic load of X.Var with ordering
// AO, followed by a non-atomic store into V.Var. Non-integer element types
// are loaded as a same-width integer and cast back.
7286 AtomicOrdering AO) {
7287 if (!updateToLocation(Loc))
7288 return Loc.IP;
7289 
7290 assert(X.Var->getType()->isPointerTy() &&
7291 "OMP Atomic expects a pointer to target memory");
7292 Type *XElemTy = X.ElemTy;
7293 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7294 XElemTy->isPointerTy()) &&
7295 "OMP atomic read expected a scalar type");
7296 
7297 Value *XRead = nullptr;
7298 
7299 if (XElemTy->isIntegerTy()) {
7300 LoadInst *XLD =
7301 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
7302 XLD->setAtomic(AO);
7303 XRead = cast<Value>(XLD);
7304 } else {
7305 // We need to perform atomic op as integer
// NOTE(review): the IntegerType::get expression initializing IntCastTy
// (doc line 7307) was elided by the extraction.
7306 IntegerType *IntCastTy =
7308 LoadInst *XLoad =
7309 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
7310 XLoad->setAtomic(AO);
7311 if (XElemTy->isFloatingPointTy()) {
7312 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
7313 } else {
7314 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
7315 }
7316 }
// Possible implicit flush, then the plain store of the read value into v.
7317 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
7318 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
7319 return Builder.saveIP();
7320}
7321
// OpenMPIRBuilder::createAtomicWrite (signature start, doc lines 7322-7323,
// lost in extraction): emit `x = expr` as an atomic store into X.Var with
// ordering AO. Non-integer element types are bitcast to a same-width integer
// before the store.
7324 AtomicOpValue &X, Value *Expr,
7325 AtomicOrdering AO) {
7326 if (!updateToLocation(Loc))
7327 return Loc.IP;
7328 
7329 assert(X.Var->getType()->isPointerTy() &&
7330 "OMP Atomic expects a pointer to target memory");
7331 Type *XElemTy = X.ElemTy;
7332 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7333 XElemTy->isPointerTy()) &&
7334 "OMP atomic write expected a scalar type");
7335 
7336 if (XElemTy->isIntegerTy()) {
7337 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
7338 XSt->setAtomic(AO);
7339 } else {
7340 // We need to bitcast and perform atomic op as integers
// NOTE(review): the IntegerType::get expression initializing IntCastTy
// (doc line 7342) was elided by the extraction.
7341 IntegerType *IntCastTy =
7343 Value *ExprCast =
7344 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
7345 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
7346 XSt->setAtomic(AO);
7347 }
7348 
7349 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
7350 return Builder.saveIP();
7351}
7352
// OpenMPIRBuilder::createAtomicUpdate (signature start, doc line 7353, lost
// in extraction): emit `x = x binop expr` atomically by delegating to
// emitAtomicUpdate, then emit the implicit flush if the ordering requires it.
// The debug-only block validates that X is a scalar pointer target and that
// the binop is one OpenMP atomics support (no LT/GT).
7354 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7355 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7356 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
7357 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
7358 if (!updateToLocation(Loc))
7359 return Loc.IP;
7360 
7361 LLVM_DEBUG({
7362 Type *XTy = X.Var->getType();
7363 assert(XTy->isPointerTy() &&
7364 "OMP Atomic expects a pointer to target memory");
7365 Type *XElemTy = X.ElemTy;
7366 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7367 XElemTy->isPointerTy()) &&
7368 "OMP atomic update expected a scalar type");
7369 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7370 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
7371 "OpenMP atomic does not support LT or GT operations");
7372 });
7373 
7374 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
7375 X.IsVolatile, IsXBinopExpr);
7376 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
7377 return Builder.saveIP();
7378}
7379
7380// FIXME: Duplicating AtomicExpand
// Re-materialize the result of an atomicrmw as an ordinary (non-atomic)
// instruction: given the old value (Src1) and the operand (Src2), compute the
// value the atomicrmw stored. Used to produce the "updated" value for
// capture semantics.
7381Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
7382 AtomicRMWInst::BinOp RMWOp) {
7383 switch (RMWOp) {
7384 case AtomicRMWInst::Add:
7385 return Builder.CreateAdd(Src1, Src2);
7386 case AtomicRMWInst::Sub:
7387 return Builder.CreateSub(Src1, Src2);
7388 case AtomicRMWInst::And:
7389 return Builder.CreateAnd(Src1, Src2);
// NOTE(review): the case label preceding this Nand-style body (doc line
// 7390) was elided by the extraction.
7391 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
7392 case AtomicRMWInst::Or:
7393 return Builder.CreateOr(Src1, Src2);
7394 case AtomicRMWInst::Xor:
7395 return Builder.CreateXor(Src1, Src2);
// NOTE(review): several unsupported-op case labels (doc lines 7396-7399
// and 7402-7407) were elided; the visible Max/Min labels fall through to
// the unreachable below.
7400 case AtomicRMWInst::Max:
7401 case AtomicRMWInst::Min:
7408 llvm_unreachable("Unsupported atomic update operation");
7409 }
7410 llvm_unreachable("Unsupported atomic update operation");
7411}
7412
// Emit the atomic update of *X with Expr. Fast path: a single atomicrmw when
// RMWOp maps directly onto an integer atomicrmw. Slow path: a compare-and-
// exchange loop that loads *X, applies UpdateOp, and retries on failure.
// Returns {old value, updated value}.
7413std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
7414 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
// NOTE(review): the middle signature line (doc line 7415, presumably
// `AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,`) was elided.
7416 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
7417 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
7418 // or a complex datatype.
7419 bool emitRMWOp = false;
7420 switch (RMWOp) {
// Commutative/bitwise ops can use atomicrmw regardless of operand order.
// NOTE(review): additional case labels (doc lines 7423 and 7426) were
// elided by the extraction.
7421 case AtomicRMWInst::Add:
7422 case AtomicRMWInst::And:
7424 case AtomicRMWInst::Or:
7425 case AtomicRMWInst::Xor:
7427 emitRMWOp = XElemTy;
7428 break;
7429 case AtomicRMWInst::Sub:
// Sub is only directly expressible when x is the LHS (x = x - expr).
7430 emitRMWOp = (IsXBinopExpr && XElemTy);
7431 break;
7432 default:
7433 emitRMWOp = false;
7434 }
// atomicrmw fast path is integer-only here; everything else goes through
// the cmpxchg loop below.
7435 emitRMWOp &= XElemTy->isIntegerTy();
7436 
7437 std::pair<Value *, Value *> Res;
7438 if (emitRMWOp) {
7439 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
7440 // not needed except in case of postfix captures. Generate anyway for
7441 // consistency with the else part. Will be removed with any DCE pass.
7442 // AtomicRMWInst::Xchg does not have a coressponding instruction.
7443 if (RMWOp == AtomicRMWInst::Xchg)
7444 Res.second = Res.first;
7445 else
7446 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
7447 } else {
// NOTE(review): the IntegerType::get expression initializing IntCastTy
// (doc line 7449) was elided.
7448 IntegerType *IntCastTy =
7450 LoadInst *OldVal =
7451 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
7452 OldVal->setAtomic(AO);
7453 // CurBB
7454 // | /---\
7455 // ContBB |
7456 // | \---/
7457 // ExitBB
// NOTE(review): the binding of `CurBB` (doc line 7458) was elided.
7459 Instruction *CurBBTI = CurBB->getTerminator();
7460 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
7461 BasicBlock *ExitBB =
7462 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
7463 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
7464 X->getName() + ".atomic.cont");
7465 ContBB->getTerminator()->eraseFromParent();
7466 Builder.restoreIP(AllocaIP);
// Scratch slot used to round-trip the updated value through memory so it
// can be reloaded as the cast integer type for the cmpxchg.
7467 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
7468 NewAtomicAddr->setName(X->getName() + "x.new.val");
7469 Builder.SetInsertPoint(ContBB);
// PHI carries the "expected" value: initial load on entry, cmpxchg's
// observed value on each retry.
7470 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
7471 PHI->addIncoming(OldVal, CurBB);
7472 bool IsIntTy = XElemTy->isIntegerTy();
7473 Value *OldExprVal = PHI;
7474 if (!IsIntTy) {
7475 if (XElemTy->isFloatingPointTy()) {
7476 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
7477 X->getName() + ".atomic.fltCast");
7478 } else {
7479 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
7480 X->getName() + ".atomic.ptrCast");
7481 }
7482 }
7483 
7484 Value *Upd = UpdateOp(OldExprVal, Builder);
7485 Builder.CreateStore(Upd, NewAtomicAddr);
7486 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
// NOTE(review): the failure-ordering computation and the CreateAtomicCmpXchg
// prefix (doc lines 7487-7489) were elided; `Failure` and `Result` are bound
// there.
7490 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
7491 Result->setVolatile(VolatileX);
7492 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
7493 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7494 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
// Loop back to ContBB until the exchange succeeds.
7495 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
7496 
7497 Res.first = OldExprVal;
7498 Res.second = Upd;
7499 
7500 // set Insertion point in exit block
// If we created the placeholder unreachable above, drop it and append to
// ExitBB; otherwise insert before the pre-existing terminator.
7501 if (UnreachableInst *ExitTI =
7502 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
7503 CurBBTI->eraseFromParent();
7504 Builder.SetInsertPoint(ExitBB);
7505 } else {
7506 Builder.SetInsertPoint(ExitTI);
7507 }
7508 }
7509 
7510 return Res;
7511}
7512
// OpenMPIRBuilder::createAtomicCapture (signature start, doc line 7513, lost
// in extraction): emit an atomic update of x and capture either the old value
// (postfix, v = x; x op= expr) or the new value (prefix, x op= expr; v = x)
// into v. When UpdateExpr is false, x is simply exchanged with expr.
7514 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7515 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
// NOTE(review): the signature line carrying RMWOp/UpdateOp (doc line 7516)
// was elided by the extraction.
7517 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
7518 if (!updateToLocation(Loc))
7519 return Loc.IP;
7520 
7521 LLVM_DEBUG({
7522 Type *XTy = X.Var->getType();
7523 assert(XTy->isPointerTy() &&
7524 "OMP Atomic expects a pointer to target memory");
7525 Type *XElemTy = X.ElemTy;
7526 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7527 XElemTy->isPointerTy()) &&
7528 "OMP atomic capture expected a scalar type");
7529 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7530 "OpenMP atomic does not support LT or GT operations");
7531 });
7532 
7533 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
7534 // 'x' is simply atomically rewritten with 'expr'.
7535 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
7536 std::pair<Value *, Value *> Result =
7537 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
7538 X.IsVolatile, IsXBinopExpr);
7539 
// Result.first is the old value, Result.second the updated value.
7540 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
7541 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
7542 
7543 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
7544 return Builder.saveIP();
7545}
7546
// OpenMPIRBuilder::createAtomicCompare convenience overload (signature start,
// doc lines 7547-7549, lost in extraction): forwards to the full overload
// below, supplying a failure ordering derived on doc line 7553 (elided here;
// presumably the strongest failure ordering compatible with AO — confirm
// upstream).
7550 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
7551 bool IsFailOnly) {
7552 
7553
7554 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
7555 IsPostfixUpdate, IsFailOnly, Failure);
7556}
7557
// OpenMPIRBuilder::createAtomicCompare, full overload (signature start, doc
// lines 7558-7560, lost in extraction): emit an OpenMP `atomic compare`.
// Op == EQ lowers to cmpxchg (optionally capturing the old value into V and
// the success flag into R); Op == MAX/MIN lowers to the matching atomicrmw
// min/max after translating OpenMP's comparison direction into LLVM's.
7561 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
7562 bool IsFailOnly, AtomicOrdering Failure) {
7563 
7564 if (!updateToLocation(Loc))
7565 return Loc.IP;
7566 
7567 assert(X.Var->getType()->isPointerTy() &&
7568 "OMP atomic expects a pointer to target memory");
7569 // compare capture
7570 if (V.Var) {
7571 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
7572 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
7573 }
7574 
7575 bool IsInteger = E->getType()->isIntegerTy();
7576 
7577 if (Op == OMPAtomicCompareOp::EQ) {
7578 AtomicCmpXchgInst *Result = nullptr;
7579 if (!IsInteger) {
// cmpxchg operates on integers: bitcast FP expected/desired values.
7580 IntegerType *IntCastTy =
7581 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
7582 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
7583 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
7584 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
7585 AO, Failure);
7586 } else {
7587 Result =
7588 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
7589 }
7590 
7591 if (V.Var) {
7592 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
7593 if (!IsInteger)
7594 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
7595 assert(OldValue->getType() == V.ElemTy &&
7596 "OldValue and V must be of same type");
7597 if (IsPostfixUpdate) {
7598 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
7599 } else {
7600 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7601 if (IsFailOnly) {
7602 // CurBB----
7603 // | |
7604 // v |
7605 // ContBB |
7606 // | |
7607 // v |
7608 // ExitBB <-
7609 //
7610 // where ContBB only contains the store of old value to 'v'.
// NOTE(review): the binding of `CurBB` (doc line 7611) was elided by
// the extraction.
7612 Instruction *CurBBTI = CurBB->getTerminator();
7613 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
7614 BasicBlock *ExitBB = CurBB->splitBasicBlock(
7615 CurBBTI, X.Var->getName() + ".atomic.exit");
7616 BasicBlock *ContBB = CurBB->splitBasicBlock(
7617 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
7618 ContBB->getTerminator()->eraseFromParent();
7619 CurBB->getTerminator()->eraseFromParent();
7620 
// Store the observed value into v only when the exchange failed.
7621 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
7622 
7623 Builder.SetInsertPoint(ContBB);
7624 Builder.CreateStore(OldValue, V.Var);
7625 Builder.CreateBr(ExitBB);
7626 
7627 if (UnreachableInst *ExitTI =
7628 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
7629 CurBBTI->eraseFromParent();
7630 Builder.SetInsertPoint(ExitBB);
7631 } else {
7632 Builder.SetInsertPoint(ExitTI);
7633 }
7634 } else {
// Non-fail-only capture: v gets E on success, else the old value.
7635 Value *CapturedValue =
7636 Builder.CreateSelect(SuccessOrFail, E, OldValue);
7637 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
7638 }
7639 }
7640 }
7641 // The comparison result has to be stored.
7642 if (R.Var) {
7643 assert(R.Var->getType()->isPointerTy() &&
7644 "r.var must be of pointer type");
7645 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
7646 
7647 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7648 Value *ResultCast = R.IsSigned
7649 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
7650 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
7651 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
7652 }
7653 } else {
7654 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
7655 "Op should be either max or min at this point");
7656 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
7657 
7658 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
7659 // Let's take max as example.
7660 // OpenMP form:
7661 // x = x > expr ? expr : x;
7662 // LLVM form:
7663 // *ptr = *ptr > val ? *ptr : val;
7664 // We need to transform to LLVM form.
7665 // x = x <= expr ? x : expr;
// NOTE(review): the declaration of `NewOp` (doc line 7666) and the
// second operand of each ternary (doc lines 7671/7674/7677/7683/7686/
// 7689 — the corresponding Max/UMax/FMax/Min/UMin/FMin alternatives)
// were elided by the extraction.
7667 if (IsXBinopExpr) {
7668 if (IsInteger) {
7669 if (X.IsSigned)
7670 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
7672 else
7673 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
7675 } else {
7676 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
7678 }
7679 } else {
7680 if (IsInteger) {
7681 if (X.IsSigned)
7682 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
7684 else
7685 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
7687 } else {
7688 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
7690 }
7691 }
7692 
7693 AtomicRMWInst *OldValue =
7694 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
7695 if (V.Var) {
7696 Value *CapturedValue = nullptr;
7697 if (IsPostfixUpdate) {
7698 CapturedValue = OldValue;
7699 } else {
// Prefix capture: recompute the stored value non-atomically from the
// old value and E, picking the predicate matching NewOp.
// NOTE(review): case labels at doc lines 7705/7708/7714/7717 (the
// UMax/FMax/UMin/FMin labels) were elided by the extraction.
7700 CmpInst::Predicate Pred;
7701 switch (NewOp) {
7702 case AtomicRMWInst::Max:
7703 Pred = CmpInst::ICMP_SGT;
7704 break;
7706 Pred = CmpInst::ICMP_UGT;
7707 break;
7709 Pred = CmpInst::FCMP_OGT;
7710 break;
7711 case AtomicRMWInst::Min:
7712 Pred = CmpInst::ICMP_SLT;
7713 break;
7715 Pred = CmpInst::ICMP_ULT;
7716 break;
7718 Pred = CmpInst::FCMP_OLT;
7719 break;
7720 default:
7721 llvm_unreachable("unexpected comparison op");
7722 }
7723 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
7724 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
7725 }
7726 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
7727 }
7728 }
7729 
7730 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
7731 
7732 return Builder.saveIP();
7733}
7734
// OpenMPIRBuilder::createTeams (signature start, doc lines 7735-7736, lost in
// extraction): emit a `teams` region. The region body is generated via
// BodyGenCB into blocks that are later outlined into a function; on the host,
// num_teams bounds / thread_limit / if-clause are pushed via
// __kmpc_push_num_teams_51 and the outlined function is launched through
// __kmpc_fork_teams.
7737 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
7738 Value *NumTeamsUpper, Value *ThreadLimit,
7739 Value *IfExpr) {
7740 if (!updateToLocation(Loc))
7741 return InsertPointTy();
7742 
7743 uint32_t SrcLocStrSize;
7744 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7745 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7746 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
7747 
7748 // Outer allocation basicblock is the entry block of the current function.
7749 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
7750 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
// Never split the entry block itself; move into a fresh block first.
7751 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
7752 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
7753 }
7754 
7755 // The current basic block is split into four basic blocks. After outlining,
7756 // they will be mapped as follows:
7757 // ```
7758 // def current_fn() {
7759 // current_basic_block:
7760 // br label %teams.exit
7761 // teams.exit:
7762 // ; instructions after teams
7763 // }
7764 //
7765 // def outlined_fn() {
7766 // teams.alloca:
7767 // br label %teams.body
7768 // teams.body:
7769 // ; instructions within teams body
7770 // }
7771 // ```
7772 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
7773 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
7774 BasicBlock *AllocaBB =
7775 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
7776 
7777 bool SubClausesPresent =
7778 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
7779 // Push num_teams
7780 if (!Config.isTargetDevice() && SubClausesPresent) {
7781 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
7782 "if lowerbound is non-null, then upperbound must also be non-null "
7783 "for bounds on num_teams");
7784 
// 0 means "no bound specified" to the runtime.
7785 if (NumTeamsUpper == nullptr)
7786 NumTeamsUpper = Builder.getInt32(0);
7787 
7788 if (NumTeamsLower == nullptr)
7789 NumTeamsLower = NumTeamsUpper;
7790 
7791 if (IfExpr) {
7792 assert(IfExpr->getType()->isIntegerTy() &&
7793 "argument to if clause must be an integer value");
7794 
7795 // upper = ifexpr ? upper : 1
7796 if (IfExpr->getType() != Int1)
7797 IfExpr = Builder.CreateICmpNE(IfExpr,
7798 ConstantInt::get(IfExpr->getType(), 0));
7799 NumTeamsUpper = Builder.CreateSelect(
7800 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
7801 
7802 // lower = ifexpr ? lower : 1
7803 NumTeamsLower = Builder.CreateSelect(
7804 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
7805 }
7806 
7807 if (ThreadLimit == nullptr)
7808 ThreadLimit = Builder.getInt32(0);
7809 
7810 Value *ThreadNum = getOrCreateThreadID(Ident);
// NOTE(review): the call prefix (doc line 7811, presumably
// Builder.CreateCall) was elided by the extraction.
7812 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
7813 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
7814 }
7815 // Generate the body of teams.
7816 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
7817 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
7818 BodyGenCB(AllocaIP, CodeGenIP);
7819 
7820 OutlineInfo OI;
7821 OI.EntryBB = AllocaBB;
7822 OI.ExitBB = ExitBB;
7823 OI.OuterAllocaBB = &OuterAllocaBB;
7824 
7825 // Insert fake values for global tid and bound tid.
7826 std::stack<Instruction *> ToBeDeleted;
7827 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
// NOTE(review): the call prefixes on doc lines 7828 and 7830 (presumably
// createFakeIntVal pushed into OI.ExcludeArgsFromAggregate) were elided.
7829 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
7831 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
7832 
7833 auto HostPostOutlineCB = [this, Ident,
7834 ToBeDeleted](Function &OutlinedFn) mutable {
7835 // The stale call instruction will be replaced with a new call instruction
7836 // for runtime call with the outlined function.
7837 
7838 assert(OutlinedFn.getNumUses() == 1 &&
7839 "there must be a single user for the outlined function");
7840 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7841 ToBeDeleted.push(StaleCI);
7842 
7843 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
7844 "Outlined function must have two or three arguments only");
7845 
// Third argument, when present, is the aggregated captured-variables blob.
7846 bool HasShared = OutlinedFn.arg_size() == 3;
7847 
7848 OutlinedFn.getArg(0)->setName("global.tid.ptr");
7849 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
7850 if (HasShared)
7851 OutlinedFn.getArg(2)->setName("data");
7852 
7853 // Call to the runtime function for teams in the current function.
7854 assert(StaleCI && "Error while outlining - no CallInst user found for the "
7855 "outlined function.");
7856 Builder.SetInsertPoint(StaleCI);
7857 SmallVector<Value *> Args = {
7858 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
7859 if (HasShared)
7860 Args.push_back(StaleCI->getArgOperand(2));
// NOTE(review): the call prefix (doc line 7861, presumably
// Builder.CreateCall(getOrCreateRuntimeFunctionPtr(...)) was elided.
7862 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
7863 Args);
7864 
// Clean up the placeholder instructions created before outlining.
7865 while (!ToBeDeleted.empty()) {
7866 ToBeDeleted.top()->eraseFromParent();
7867 ToBeDeleted.pop();
7868 }
7869 };
7870 
7871 if (!Config.isTargetDevice())
7872 OI.PostOutlineCB = HostPostOutlineCB;
7873 
7874 addOutlineInfo(std::move(OI));
7875 
// Resume code generation after the teams region.
7876 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
7877 
7878 return Builder.saveIP();
7879}
7880
// Builds a private, constant global array holding the map-name strings the
// offloading runtime uses for diagnostics, and returns that GlobalVariable.
// NOTE(review): the leading signature lines are elided in this extraction —
// presumably OpenMPIRBuilder::createOffloadMapnames(Names, VarName); confirm
// against the class declaration.
7883 std::string VarName) {
7884 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
7886 Names.size()),
7887 Names);
// The array is private linkage: it is only referenced from this module's
// generated runtime calls, never exported.
7888 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
7889 M, MapNamesArrayInit->getType(),
7890 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
7891 VarName);
7892 return MapNamesArrayGlobal;
7893}
7894
7895// Create all simple and struct types exposed by the runtime and remember
7896// the llvm::PointerTypes of them for easy access later.
7897void OpenMPIRBuilder::initializeTypes(Module &M) {
7898 LLVMContext &Ctx = M.getContext();
7899 StructType *T;
// X-macro expansion of OMPKinds.def: each macro below assigns the member for
// one runtime type and, where applicable, its companion pointer-type member.
7900#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
7901#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
7902 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
7903 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
7904#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
7905 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
7906 VarName##Ptr = PointerType::getUnqual(VarName);
// Struct types are looked up by name first so that re-initialization reuses
// an already-created identified struct instead of minting a duplicate.
7907#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
7908 T = StructType::getTypeByName(Ctx, StructName); \
7909 if (!T) \
7910 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
7911 VarName = T; \
7912 VarName##Ptr = PointerType::getUnqual(T);
7913#include "llvm/Frontend/OpenMP/OMPKinds.def"
7914}
7915
// Worklist walk over successor edges starting at EntryBB, appending every
// newly-seen block to BlockVector. ExitBB is pre-seeded into the visited set
// so the traversal does not continue past the region's exit.
// NOTE(review): the signature line and the Worklist/BlockSet declarations are
// elided in this extraction.
7918 SmallVectorImpl<BasicBlock *> &BlockVector) {
7920 BlockSet.insert(EntryBB);
7921 BlockSet.insert(ExitBB);
7922
7923 Worklist.push_back(EntryBB);
7924 while (!Worklist.empty()) {
7925 BasicBlock *BB = Worklist.pop_back_val();
7926 BlockVector.push_back(BB);
// insert().second is true only for first-time insertions, so each block is
// enqueued at most once.
7927 for (BasicBlock *SuccBB : successors(BB))
7928 if (BlockSet.insert(SuccBB).second)
7929 Worklist.push_back(SuccBB);
7930 }
7931}
7932
// Emits one offloading entry. On the host (non-GPU) path it forwards to the
// offloading-entry emitter targeting the "omp_offloading_entries" section and
// returns. On the device path it only handles functions: the kernel is marked
// via the "nvvm.annotations" named metadata plus "kernel"/MustProgress
// attributes (and uniform-work-group-size for AMDGCN).
// NOTE(review): the signature lines and the host-side emitter call are elided
// in this extraction.
7934 uint64_t Size, int32_t Flags,
7936 StringRef Name) {
7937 if (!Config.isGPU()) {
7939 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
7940 "omp_offloading_entries");
7941 return;
7942 }
7943 // TODO: Add support for global variables on the device after declare target
7944 // support.
7945 Function *Fn = dyn_cast<Function>(Addr);
7946 if (!Fn)
7947 return;
7948
7949 Module &M = *(Fn->getParent());
7950 LLVMContext &Ctx = M.getContext();
7951
7952 // Get "nvvm.annotations" metadata node.
7953 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
7954
// Triple (function, "kernel", 1) is the NVVM convention for marking a kernel.
7955 Metadata *MDVals[] = {
7956 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
7957 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
7958 // Append metadata to nvvm.annotations.
7959 MD->addOperand(MDNode::get(Ctx, MDVals));
7960
7961 // Add a function attribute for the kernel.
7962 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
7963 if (T.isAMDGCN())
7964 Fn->addFnAttr("uniform-work-group-size", "true");
7965 Fn->addFnAttr(Attribute::MustProgress);
7966}
7967
7968// We only generate metadata for functions that contain target regions.
// Emits the "omp_offload.info" named-metadata table describing every target
// region and declare-target global, then creates the corresponding offload
// entries in registration order. ErrorFn reports entries whose address/ID was
// never materialized. NOTE(review): several lines (the signature, the
// OrderedEntries element type, enum case labels, and some call arguments) are
// elided in this extraction.
7971
7972 // If there are no entries, we don't need to do anything.
7974 return;
7975
7979 16>
7980 OrderedEntries(OffloadInfoManager.size());
7981
7982 // Auxiliary methods to create metadata values and strings.
7983 auto &&GetMDInt = [this](unsigned V) {
7984 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
7985 };
7986
7987 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
7988
7989 // Create the offloading info metadata node.
7990 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
7991 auto &&TargetRegionMetadataEmitter =
7992 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
7993 const TargetRegionEntryInfo &EntryInfo,
7995 // Generate metadata for target regions. Each entry of this metadata
7996 // contains:
7997 // - Entry 0 -> Kind of this type of metadata (0).
7998 // - Entry 1 -> Device ID of the file where the entry was identified.
7999 // - Entry 2 -> File ID of the file where the entry was identified.
8000 // - Entry 3 -> Mangled name of the function where the entry was
8001 // identified.
8002 // - Entry 4 -> Line in the file where the entry was identified.
8003 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
8004 // - Entry 6 -> Order the entry was created.
8005 // The first element of the metadata node is the kind.
8006 Metadata *Ops[] = {
8007 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
8008 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
8009 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
8010 GetMDInt(E.getOrder())};
8011
8012 // Save this entry in the right position of the ordered entries array.
8013 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
8014
8015 // Add metadata to the named metadata node.
8016 MD->addOperand(MDNode::get(C, Ops));
8017 };
8018
8019 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
8020
8021 // Create function that emits metadata for each device global variable entry;
8022 auto &&DeviceGlobalVarMetadataEmitter =
8023 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
8024 StringRef MangledName,
8026 // Generate metadata for global variables. Each entry of this metadata
8027 // contains:
8028 // - Entry 0 -> Kind of this type of metadata (1).
8029 // - Entry 1 -> Mangled name of the variable.
8030 // - Entry 2 -> Declare target kind.
8031 // - Entry 3 -> Order the entry was created.
8032 // The first element of the metadata node is the kind.
8033 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
8034 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
8035
8036 // Save this entry in the right position of the ordered entries array.
// Global-var entries have no real source location, so a zeroed EntryInfo is
// used purely as the ordered-array payload.
8037 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
8038 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
8039
8040 // Add metadata to the named metadata node.
8041 MD->addOperand(MDNode::get(C, Ops));
8042 };
8043
8045 DeviceGlobalVarMetadataEmitter);
8046
// Second pass: walk entries in creation order and materialize the actual
// offload entries, diagnosing entries that never received an address/ID.
8047 for (const auto &E : OrderedEntries) {
8048 assert(E.first && "All ordered entries must exist!");
8049 if (const auto *CE =
8050 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
8051 E.first)) {
8052 if (!CE->getID() || !CE->getAddress()) {
8053 // Do not blame the entry if the parent function is not emitted.
8054 TargetRegionEntryInfo EntryInfo = E.second;
8055 StringRef FnName = EntryInfo.ParentName;
8056 if (!M.getNamedValue(FnName))
8057 continue;
8058 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
8059 continue;
8060 }
8061 createOffloadEntry(CE->getID(), CE->getAddress(),
8062 /*Size=*/0, CE->getFlags(),
8064 } else if (const auto *CE = dyn_cast<
8066 E.first)) {
8069 CE->getFlags());
8070 switch (Flags) {
8074 continue;
8075 if (!CE->getAddress()) {
8076 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
8077 continue;
8078 }
8079 // The variable has no definition - no need to add the entry.
8080 if (CE->getVarSize() == 0)
8081 continue;
8082 break;
// NOTE(review): "Declaret" in the assert string below is a typo in the
// original source; left as-is because it is a runtime string.
8084 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
8085 (!Config.isTargetDevice() && CE->getAddress())) &&
8086 "Declaret target link address is set.");
8087 if (Config.isTargetDevice())
8088 continue;
8089 if (!CE->getAddress()) {
8091 continue;
8092 }
8093 break;
8094 default:
8095 break;
8096 }
8097
8098 // Hidden or internal symbols on the device are not externally visible.
8099 // We should not attempt to register them by creating an offloading
8100 // entry. Indirect variables are handled separately on the device.
8101 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
8102 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
8104 continue;
8105
8106 // Indirect globals need to use a special name that doesn't match the name
8107 // of the associated host global.
8109 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8110 Flags, CE->getLinkage(), CE->getVarName());
8111 else
8112 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8113 Flags, CE->getLinkage());
8114
8115 } else {
8116 llvm_unreachable("Unsupported entry kind.");
8117 }
8118 }
8119
8120 // Emit requires directive globals to a special entry so the runtime can
8121 // register them when the device image is loaded.
8122 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
8123 // entries should be redesigned to better suit this use-case.
8127 /*Name=*/"",
8129 Config.getRequiresFlags(), "omp_offloading_entries");
8130}
8131
// Composes the unique outlined-region name
// "__omp_offloading_<dev-hex>_<file-hex>_<parent>_l<line>[_<count>]" into
// Name. NOTE(review): the signature line and the raw_svector_ostream
// declaration are elided in this extraction.
8133 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
8134 unsigned FileID, unsigned Line, unsigned Count) {
8136 OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
8137 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
// The count suffix is only appended for second and later regions at the same
// source location (Count == 0 means first occurrence).
8138 if (Count)
8139 OS << "_" << Count;
8140}
8141
// Member-level wrapper: resolves the next per-location count and delegates to
// the free-standing name composer above. NOTE(review): the signature line and
// the callee line are elided in this extraction.
8144 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
8146 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
8147 EntryInfo.Line, NewCount);
8148}
8149
// Builds a TargetRegionEntryInfo whose DeviceID/FileID come from the file's
// filesystem unique ID (via sys::fs::getUniqueID) and whose line comes from
// the callback-provided (path, line) tuple. Aborts compilation if the unique
// ID cannot be obtained. NOTE(review): the signature start and the
// sys::fs::UniqueID declaration are elided in this extraction.
8152 StringRef ParentName) {
8154 auto FileIDInfo = CallBack();
8155 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
8156 report_fatal_error(("Unable to get unique ID for file, during "
8157 "getTargetEntryUniqueInfo, error message: " +
8158 EC.message())
8159 .c_str());
8160 }
8161
8162 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
8163 std::get<1>(FileIDInfo));
8164}
8165
// Counts the trailing zero bits of the MEMBER_OF mapping-flag mask, i.e. the
// bit position at which MEMBER_OF values start. NOTE(review): the signature
// and the flag-constant line of the loop header are elided in this
// extraction.
8167 unsigned Offset = 0;
// Shift right until the lowest set bit of the mask reaches bit 0.
8168 for (uint64_t Remain =
8169 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8171 !(Remain & 1); Remain = Remain >> 1)
8172 Offset++;
8173 return Offset;
8174}
8175
// Encodes a 1-based member position into the MEMBER_OF field of the mapping
// flags by shifting it into the field's bit range. NOTE(review): the
// signature lines are elided in this extraction.
8178 // Rotate by getFlagMemberOffset() bits.
8179 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
8180 << getFlagMemberOffset());
8181}
8182
// Replaces the MEMBER_OF placeholder in Flags with the real MemberOfFlag.
// Entries that do not carry the placeholder are left untouched.
// NOTE(review): the signature start and the middle lines of the guard
// condition are elided in this extraction.
8185 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
8186 // If the entry is PTR_AND_OBJ but has not been marked with the special
8187 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
8188 // marked as MEMBER_OF.
8189 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8191 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8194 return;
8195
8196 // Reset the placeholder value to prepare the flag for the assignment of the
8197 // proper MEMBER_OF value.
8198 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
8199 Flags |= MemberOfFlag;
8200}
8201
// Returns (creating on first use) the "_decl_tgt_ref_ptr" indirection global
// for a declare-target 'link' (or 'to' with device pointer capture) variable;
// returns nullptr for the SIMD-only case and for clauses that need no
// indirection. NOTE(review): the signature start and the clause-kind
// condition lines are elided in this extraction.
8205 bool IsDeclaration, bool IsExternallyVisible,
8206 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8207 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8208 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
8209 std::function<Constant *()> GlobalInitializer,
8210 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
8211 // TODO: convert this to utilise the IRBuilder Config rather than
8212 // a passed down argument.
8213 if (OpenMPSIMD)
8214 return nullptr;
8215
8218 CaptureClause ==
// The ref-pointer name embeds the FileID for non-externally-visible symbols
// so distinct TUs do not collide.
8221 SmallString<64> PtrName;
8222 {
8223 raw_svector_ostream OS(PtrName);
8224 OS << MangledName;
8225 if (!IsExternallyVisible)
8226 OS << format("_%x", EntryInfo.FileID);
8227 OS << "_decl_tgt_ref_ptr";
8228 }
8229
8230 Value *Ptr = M.getNamedValue(PtrName);
8231
8232 if (!Ptr) {
8233 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
8234 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
8235
8236 auto *GV = cast<GlobalVariable>(Ptr);
8237 GV->setLinkage(GlobalValue::WeakAnyLinkage);
8238
// Only the host initializes the ref pointer; on the device it is resolved by
// the runtime at registration time.
8239 if (!Config.isTargetDevice()) {
8240 if (GlobalInitializer)
8241 GV->setInitializer(GlobalInitializer());
8242 else
8243 GV->setInitializer(GlobalValue);
8244 }
8245
8247 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8248 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8249 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
8250 }
8251
8252 return cast<Constant>(Ptr);
8253 }
8254
8255 return nullptr;
8256}
8257
// Registers a declare-target global with the OffloadEntriesInfoManager,
// computing its name, size, linkage and flags according to the capture/device
// clauses and host/device compilation mode. NOTE(review): the signature
// start, the early-exit condition, the flag assignments, and the clause-kind
// condition lines are elided in this extraction.
8261 bool IsDeclaration, bool IsExternallyVisible,
8262 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8263 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8264 std::vector<Triple> TargetTriple,
8265 std::function<Constant *()> GlobalInitializer,
8266 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
8267 Constant *Addr) {
8269 (TargetTriple.empty() && !Config.isTargetDevice())) 
8270 return;
8271
8273 StringRef VarName;
8274 int64_t VarSize;
8276
8278 CaptureClause ==
// 'to'-style capture: the entry describes the variable itself.
8282 VarName = MangledName;
8283 GlobalValue *LlvmVal = M.getNamedValue(VarName);
8284
// Declarations contribute a zero size; definitions use the type store size.
8285 if (!IsDeclaration)
8286 VarSize = divideCeil(
8288 else
8289 VarSize = 0;
8290 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
8291
8292 // This is a workaround carried over from Clang which prevents undesired
8293 // optimisation of internal variables.
8294 if (Config.isTargetDevice() &&
8295 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
8296 // Do not create a "ref-variable" if the original is not also available
8297 // on the host.
8299 return;
8300
8301 std::string RefName = createPlatformSpecificName({VarName, "ref"});
8302
8303 if (!M.getNamedValue(RefName)) {
8304 Constant *AddrRef =
8305 getOrCreateInternalVariable(Addr->getType(), RefName);
8306 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
8307 GvAddrRef->setConstant(true);
8308 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
8309 GvAddrRef->setInitializer(Addr);
8310 GeneratedRefs.push_back(GvAddrRef);
8311 }
8312 }
8313 } else {
8316 else
8318
// 'link'-style capture: the entry describes the indirection pointer instead;
// on the device the address is resolved at image-registration time.
8319 if (Config.isTargetDevice()) {
8320 VarName = (Addr) ? Addr->getName() : "";
8321 Addr = nullptr;
8322 } else {
8324 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8325 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8326 LlvmPtrTy, GlobalInitializer, VariableLinkage);
8327 VarName = (Addr) ? Addr->getName() : "";
8328 }
8329 VarSize = M.getDataLayout().getPointerSize();
8331 }
8332
8334 Flags, Linkage);
8335}
8336
8337/// Loads all the offload entries information from the host IR
8338/// metadata.
// NOTE(review): the signature line is elided in this extraction; the named
// metadata accessed is the "omp_offload.info" table written by
// createOffloadEntriesAndInfoMetadata(), and the operand layout here must
// stay in sync with that emitter.
8340 // If we are in target mode, load the metadata from the host IR. This code has
8341 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
8342
8344 if (!MD)
8345 return;
8346
8347 for (MDNode *MN : MD->operands()) {
// Operand decoding helpers: integers are stored as ConstantAsMetadata,
// names as MDString.
8348 auto &&GetMDInt = [MN](unsigned Idx) {
8349 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
8350 return cast<ConstantInt>(V->getValue())->getZExtValue();
8351 };
8352
8353 auto &&GetMDString = [MN](unsigned Idx) {
8354 auto *V = cast<MDString>(MN->getOperand(Idx));
8355 return V->getString();
8356 };
8357
// Operand 0 is the entry kind (target region vs. device global variable);
// the case labels are elided in this extraction.
8358 switch (GetMDInt(0)) {
8359 default:
8360 llvm_unreachable("Unexpected metadata!");
8361 break;
8364 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
8365 /*DeviceID=*/GetMDInt(1),
8366 /*FileID=*/GetMDInt(2),
8367 /*Line=*/GetMDInt(4),
8368 /*Count=*/GetMDInt(5));
8370 /*Order=*/GetMDInt(6));
8371 break;
8372 }
8376 /*MangledName=*/GetMDString(1),
8378 /*Flags=*/GetMDInt(2)),
8379 /*Order=*/GetMDInt(3));
8380 break;
8381 }
8382 }
8383}
8384
// Convenience overload: reads and parses the host bitcode file at
// HostFilePath into a temporary module and loads its offload-entry metadata.
// A missing/unparsable file is a fatal error; an empty path is a no-op.
// NOTE(review): the signature line is elided in this extraction.
8386 if (HostFilePath.empty())
8387 return;
8388
8389 auto Buf = MemoryBuffer::getFile(HostFilePath);
8390 if (std::error_code Err = Buf.getError()) {
8391 report_fatal_error(("error opening host file from host file path inside of "
8392 "OpenMPIRBuilder: " +
8393 Err.message())
8394 .c_str());
8395 }
8396
// The host module is parsed into its own context; only its metadata is
// consumed, so it does not outlive this function.
8397 LLVMContext Ctx;
8399 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
8400 if (std::error_code Err = M.getError()) {
8402 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
8403 .c_str());
8404 }
8405
8406 loadOffloadInfoMetadata(*M.get());
8407}
8408
8409//===----------------------------------------------------------------------===//
8410// OffloadEntriesInfoManager
8411//===----------------------------------------------------------------------===//
8412
// True when neither target-region entries nor device-global-variable entries
// have been registered. NOTE(review): the signature line is elided in this
// extraction.
8414 return OffloadEntriesTargetRegion.empty() &&
8415 OffloadEntriesDeviceGlobalVar.empty();
8416}
8417
8418unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
8419 const TargetRegionEntryInfo &EntryInfo) const {
8420 auto It = OffloadEntriesTargetRegionCount.find(
8421 getTargetRegionEntryCountKey(EntryInfo));
8422 if (It == OffloadEntriesTargetRegionCount.end())
8423 return 0;
8424 return It->second;
8425}
8426
8427void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
8428 const TargetRegionEntryInfo &EntryInfo) {
8429 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
8430 EntryInfo.Count + 1;
8431}
8432
8433/// Initialize target region entry.
// Creates a placeholder entry (no address, no ID) at the given creation
// order; the address/ID are filled in later by
// registerTargetRegionEntryInfo(). NOTE(review): the signature line is elided
// in this extraction.
8435 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
8436 OffloadEntriesTargetRegion[EntryInfo] =
8437 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
8438 OMPTargetRegionEntryTargetRegion);
8439 ++OffloadingEntriesNum;
8440}
8441
// Registers a target-region entry. On the device, an already-initialized
// placeholder is completed with address/ID/flags; on the host, a fresh entry
// is created. The per-location count is bumped in both cases.
// NOTE(review): the signature lines and one host-side guard condition are
// elided in this extraction.
8445 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
8446
8447 // Update the EntryInfo with the next available count for this location.
8448 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8449
8450 // If we are emitting code for a target, the entry is already initialized,
8451 // only has to be registered.
8452 if (OMPBuilder->Config.isTargetDevice()) {
8453 // This could happen if the device compilation is invoked standalone.
8454 if (!hasTargetRegionEntryInfo(EntryInfo)) {
8455 return;
8456 }
8457 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
8458 Entry.setAddress(Addr);
8459 Entry.setID(ID);
8460 Entry.setFlags(Flags);
8461 } else {
8463 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
8464 return;
8465 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
8466 "Target region entry already registered!");
8467 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
8468 OffloadEntriesTargetRegion[EntryInfo] = Entry;
8469 ++OffloadingEntriesNum;
8470 }
8471 incrementTargetRegionEntryInfoCount(EntryInfo);
8472}
8473
// True if a usable entry exists for EntryInfo's location. With
// IgnoreAddressId == false, an entry that already carries an address or ID
// counts as unavailable (it is already claimed). NOTE(review): the signature
// start is elided in this extraction.
8475 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
8476
8477 // Update the EntryInfo with the next available count for this location.
8478 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8479
8480 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
8481 if (It == OffloadEntriesTargetRegion.end()) {
8482 return false;
8483 }
8484 // Fail if this entry is already registered.
8485 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
8486 return false;
8487 return true;
8488}
8489
// Invokes Action on every registered target-region entry (key and payload).
// NOTE(review): the signature start is elided in this extraction.
8491 const OffloadTargetRegionEntryInfoActTy &Action) {
8492 // Scan all target region entries and perform the provided action.
8493 for (const auto &It : OffloadEntriesTargetRegion) {
8494 Action(It.first, It.second);
8495 }
8496}
8497
// Creates a placeholder device-global-variable entry (no address/size yet) at
// the given creation order; completed later by
// registerDeviceGlobalVarEntryInfo(). NOTE(review): the signature line is
// elided in this extraction.
8499 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
8500 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
8501 ++OffloadingEntriesNum;
8502}
8503
// Registers a device global variable. On the device, only completes a
// previously-initialized placeholder; on the host, either fills in the
// missing size/linkage of an existing entry or creates a new one.
// NOTE(review): the signature start, one parameter line, and the
// indirect-variable condition before the first try_emplace are elided in this
// extraction.
8505 StringRef VarName, Constant *Addr, int64_t VarSize,
8507 if (OMPBuilder->Config.isTargetDevice()) {
8508 // This could happen if the device compilation is invoked standalone.
8509 if (!hasDeviceGlobalVarEntryInfo(VarName))
8510 return;
8511 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
// Entry already has an address: only a zero size may still be patched up.
8512 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
8513 if (Entry.getVarSize() == 0) {
8514 Entry.setVarSize(VarSize);
8515 Entry.setLinkage(Linkage);
8516 }
8517 return;
8518 }
8519 Entry.setVarSize(VarSize);
8520 Entry.setLinkage(Linkage);
8521 Entry.setAddress(Addr);
8522 } else {
8523 if (hasDeviceGlobalVarEntryInfo(VarName)) {
8524 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
8525 assert(Entry.isValid() && Entry.getFlags() == Flags &&
8526 "Entry not initialized!");
8527 if (Entry.getVarSize() == 0) {
8528 Entry.setVarSize(VarSize);
8529 Entry.setLinkage(Linkage);
8530 }
8531 return;
8532 }
8534 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
8535 Addr, VarSize, Flags, Linkage,
8536 VarName.str());
8537 else
8538 OffloadEntriesDeviceGlobalVar.try_emplace(
8539 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
8540 ++OffloadingEntriesNum;
8541 }
8542}
8543
// Invokes Action on every registered device-global-variable entry (name and
// payload). NOTE(review): the signature lines are elided in this extraction.
8546 // Scan all target region entries and perform the provided action.
8547 for (const auto &E : OffloadEntriesDeviceGlobalVar)
8548 Action(E.getKey(), E.getValue());
8549}
8550
8551//===----------------------------------------------------------------------===//
8552// CanonicalLoopInfo
8553//===----------------------------------------------------------------------===//
8554
// Appends the six control blocks of the canonical loop (preheader, header,
// cond, latch, exit, after) to BBs. NOTE(review): the signature's parameter
// line is elided in this extraction.
8555void CanonicalLoopInfo::collectControlBlocks(
8557 // We only count those BBs as control block for which we do not need to
8558 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
8559 // flow. For consistency, this also means we do not add the Body block, which
8560 // is just the entry to the body code.
8561 BBs.reserve(BBs.size() + 6);
8562 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
8564
// Returns the unique non-latch predecessor of the loop header, i.e. the
// preheader. A canonical loop always has exactly the latch and the preheader
// as header predecessors, hence the unreachable fallthrough.
// NOTE(review): the signature line is elided in this extraction.
8566 assert(isValid() && "Requires a valid canonical loop");
8567 for (BasicBlock *Pred : predecessors(Header)) {
8568 if (Pred != Latch)
8569 return Pred;
8570 }
8571 llvm_unreachable("Missing preheader");
8572}
8573
8574void CanonicalLoopInfo::setTripCount(Value *TripCount) {
8575 assert(isValid() && "Requires a valid canonical loop");
8576
8577 Instruction *CmpI = &getCond()->front();
8578 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
8579 CmpI->setOperand(1, TripCount);
8580
8581#ifndef NDEBUG
8582 assertOK();
8583#endif
8584}
8585
8586void CanonicalLoopInfo::mapIndVar(
8587 llvm::function_ref<Value *(Instruction *)> Updater) {
8588 assert(isValid() && "Requires a valid canonical loop");
8589
8590 Instruction *OldIV = getIndVar();
8591
8592 // Record all uses excluding those introduced by the updater. Uses by the
8593 // CanonicalLoopInfo itself to keep track of the number of iterations are
8594 // excluded.
8595 SmallVector<Use *> ReplacableUses;
8596 for (Use &U : OldIV->uses()) {
8597 auto *User = dyn_cast<Instruction>(U.getUser());
8598 if (!User)
8599 continue;
8600 if (User->getParent() == getCond())
8601 continue;
8602 if (User->getParent() == getLatch())
8603 continue;
8604 ReplacableUses.push_back(&U);
8605 }
8606
8607 // Run the updater that may introduce new uses
8608 Value *NewIV = Updater(OldIV);
8609
8610 // Replace the old uses with the value returned by the updater.
8611 for (Use *U : ReplacableUses)
8612 U->set(NewIV);
8613
8614#ifndef NDEBUG
8615 assertOK();
8616#endif
8617}
8618
// Debug-only structural verification of the canonical loop: checks the
// preheader -> header -> cond -> {body, exit} -> latch -> header shape, the
// IV PHI and its increment, and the exit comparison. Compiles to nothing in
// NDEBUG builds. NOTE(review): the function signature line is elided in this
// extraction (presumably CanonicalLoopInfo::assertOK() const).
8620#ifndef NDEBUG
8621 // No constraints if this object currently does not describe a loop.
8622 if (!isValid())
8623 return;
8624
8625 BasicBlock *Preheader = getPreheader();
8626 BasicBlock *Body = getBody();
8627 BasicBlock *After = getAfter();
8628
8629 // Verify standard control-flow we use for OpenMP loops.
8630 assert(Preheader);
8631 assert(isa<BranchInst>(Preheader->getTerminator()) &&
8632 "Preheader must terminate with unconditional branch");
8633 assert(Preheader->getSingleSuccessor() == Header &&
8634 "Preheader must jump to header");
8635
8636 assert(Header);
8637 assert(isa<BranchInst>(Header->getTerminator()) &&
8638 "Header must terminate with unconditional branch");
8639 assert(Header->getSingleSuccessor() == Cond &&
8640 "Header must jump to exiting block");
8641
8642 assert(Cond);
8643 assert(Cond->getSinglePredecessor() == Header &&
8644 "Exiting block only reachable from header");
8645
8646 assert(isa<BranchInst>(Cond->getTerminator()) &&
8647 "Exiting block must terminate with conditional branch");
8648 assert(size(successors(Cond)) == 2 &&
8649 "Exiting block must have two successors");
8650 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
8651 "Exiting block's first successor jump to the body");
8652 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
8653 "Exiting block's second successor must exit the loop");
8654
8655 assert(Body);
8656 assert(Body->getSinglePredecessor() == Cond &&
8657 "Body only reachable from exiting block");
8658 assert(!isa<PHINode>(Body->front()));
8659
8660 assert(Latch);
8661 assert(isa<BranchInst>(Latch->getTerminator()) &&
8662 "Latch must terminate with unconditional branch");
8663 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
8664 // TODO: To support simple redirecting of the end of the body code that has
8665 // multiple; introduce another auxiliary basic block like preheader and after.
8666 assert(Latch->getSinglePredecessor() != nullptr);
8667 assert(!isa<PHINode>(Latch->front()));
8668
8669 assert(Exit);
8670 assert(isa<BranchInst>(Exit->getTerminator()) &&
8671 "Exit block must terminate with unconditional branch");
8672 assert(Exit->getSingleSuccessor() == After &&
8673 "Exit block must jump to after block");
8674
8675 assert(After);
8676 assert(After->getSinglePredecessor() == Exit &&
8677 "After block only reachable from exit block");
8678 assert(After->empty() || !isa<PHINode>(After->front()));
8679
// The IV must be a header PHI starting at 0 (from the preheader) and
// incremented by exactly 1 in the latch.
8680 Instruction *IndVar = getIndVar();
8681 assert(IndVar && "Canonical induction variable not found?");
8682 assert(isa<IntegerType>(IndVar->getType()) &&
8683 "Induction variable must be an integer");
8684 assert(cast<PHINode>(IndVar)->getParent() == Header &&
8685 "Induction variable must be a PHI in the loop header");
8686 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
8687 assert(
8688 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
8689 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
8690
8691 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
8692 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
8693 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
8694 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
8695 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
8696 ->isOne());
8697
8698 Value *TripCount = getTripCount();
8699 assert(TripCount && "Loop trip count not found?");
8700 assert(IndVar->getType() == TripCount->getType() &&
8701 "Trip count and induction variable must have the same type");
8702
// NOTE(review): the predicate checked below is ICMP_ULT — an UNSIGNED
// less-than — but the assert message says "signed". The message text is
// misleading; the check itself (unsigned compare, zero-based IV) is the
// intended canonical-loop condition.
8703 auto *CmpI = cast<CmpInst>(&Cond->front());
8704 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
8705 "Exit condition must be a signed less-than comparison");
8706 assert(CmpI->getOperand(0) == IndVar &&
8707 "Exit condition must compare the induction variable");
8708 assert(CmpI->getOperand(1) == TripCount &&
8709 "Exit condition must compare with the trip count");
8710#endif
8711}
8712
// Marks this CanonicalLoopInfo as no longer describing a loop by nulling the
// control-block pointers; isValid() consequently returns false afterwards.
// NOTE(review): the signature line is elided in this extraction (presumably
// CanonicalLoopInfo::invalidate()).
8714 Header = nullptr;
8715 Cond = nullptr;
8716 Latch = nullptr;
8717 Exit = nullptr;
8718}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Rewrite undef for PHI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:528
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
Value * createFakeIntVal(IRBuilder<> &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, std::stack< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:77
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:60
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:96
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:114
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:101
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:125
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:92
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:467
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:643
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:707
@ Add
*p = old + v
Definition: Instructions.h:711
@ FAdd
*p = old + v
Definition: Instructions.h:732
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ Sub
*p = old - v
Definition: Instructions.h:713
@ And
*p = old & v
Definition: Instructions.h:715
@ Xor
*p = old ^ v
Definition: Instructions.h:721
@ FSub
*p = old - v
Definition: Instructions.h:735
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:747
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:743
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:739
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:751
@ Nand
*p = ~(old & v)
Definition: Instructions.h:717
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:577
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:865
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:850
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:391
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:414
reverse_iterator rbegin()
Definition: BasicBlock.h:454
bool empty() const
Definition: BasicBlock.h:460
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const Instruction & front() const
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:575
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:495
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:169
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:465
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:487
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:277
reverse_iterator rend()
Definition: BasicBlock.h:456
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:384
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:366
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:621
const Instruction & back() const
Definition: BasicBlock.h:463
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:290
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:514
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1385
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1391
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2900
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:706
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2177
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2192
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2257
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1762
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:294
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:276
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:750
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:600
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:165
const BasicBlock & getEntryBlock() const
Definition: Function.h:800
bool empty() const
Definition: Function.h:822
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:414
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:716
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:728
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
const Function & getFunction() const
Definition: Function.h:163
iterator begin()
Definition: Function.h:816
arg_iterator arg_begin()
Definition: Function.h:831
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:353
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:628
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:745
size_t arg_size() const
Definition: Function.h:864
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:212
iterator end()
Definition: Function.h:818
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:278
Argument * getArg(unsigned i) const
Definition: Function.h:849
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1521
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:52
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:255
BasicBlock * getBlock() const
Definition: IRBuilder.h:270
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:268
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:271
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:92
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1106
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2255
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1839
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1771
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:537
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2263
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1805
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2037
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1261
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2168
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2514
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1306
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1090
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:173
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1969
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:579
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2031
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2120
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:524
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1334
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:218
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:529
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1872
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2180
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1376
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2243
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:519
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1864
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:489
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1719
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:275
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:1990
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:484
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2364
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2395
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1141
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2239
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:143
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:63
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1342
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2125
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:495
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1118
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1788
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2019
LLVMContext & getContext() const
Definition: IRBuilder.h:174
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1473
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1088
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1910
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1956
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1801
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1325
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2547
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1852
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2005
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1495
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:567
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1112
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:167
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2271
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:479
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2251
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2194
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:287
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2542
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:562
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1824
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1454
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1517
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2349
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:514
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1402
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:657
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2052
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2130
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1359
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:476
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:948
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:473
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:173
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:238
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:120
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
NamedMDNode * getNamedMetadata(const Twine &Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:262
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:284
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:297
iterator_range< global_iterator > globals()
Definition: Module.h:701
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:613
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:446
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:135
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:271
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:461
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
A tuple of MDNodes.
Definition: Metadata.h:1729
iterator_range< op_iterator > operands()
Definition: Metadata.h:1825
void addOperand(MDNode *M)
Definition: Metadata.cpp:1387
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:235
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:237
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:368
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:370
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:288
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:290
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:279
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:348
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:354
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:360
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:358
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:352
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:350
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:424
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:91
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:183
StringRef separator() const
Definition: OMPIRBuilder.h:169
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:159
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:104
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:142
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:136
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:179
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:465
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:511
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function Returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
InsertPointTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool EmitDebug=false, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for #omp task
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder ::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:491
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
InsertPointTy createTarget(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB)
Generator for '#omp target'.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:289
void setAlignment(Align Align)
Definition: Instructions.h:332
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:359
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:693
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:444
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:609
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:953
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1011
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1021
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:127
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:143
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:926
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the single unique user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of the value).
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:812
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:64
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:872
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:607
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * NumTeams
The number of teams.
Value * DynCGGroupMem
The size of the dynamic shared memory.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
Value * NumThreads
The number of threads.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:197
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61