//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before which something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
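
// Editor's note, for illustration only (not part of the original source):
// two builders saving the same insertion point conflict because each inserts
// before the same instruction, so their output interleaves. A hypothetical
// sketch (BB, T, X, Y are assumed names):
//
//   IRBuilder<> B1(Ctx), B2(Ctx);
//   B1.SetInsertPoint(T);        // T is BB's terminator
//   B2.SetInsertPoint(T);        // same point: isConflictIP(...) == true
//   B1.CreateAdd(X, Y);          // lands before T
//   B2.CreateMul(X, Y);          // also before T, interleaved with B1's code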

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block, if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "The monotonic and nonmonotonic modifiers are mutually exclusive");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
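
// Editor's note, for illustration only (not part of the original source):
// a plain `schedule(dynamic)` loop, i.e.
//   computeOpenMPScheduleType(OMP_SCHEDULE_Dynamic, /*HasChunks=*/false,
//                             /*HasSimdModifier=*/false,
//                             /*HasMonotonicModifier=*/false,
//                             /*HasNonmonotonicModifier=*/false,
//                             /*HasOrderedClause=*/false)
// yields BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic,
// following the OpenMP 5.1 defaulting rules implemented above.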

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to the new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g., a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
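
// Editor's note, for illustration only (not part of the original source):
// a typical use of the splitting helpers above, assuming the builder
// currently points into a block %entry:
//
//   BasicBlock *Cont = splitBBWithSuffix(Builder, /*CreateBranch=*/true,
//                                        ".split");
//   // %entry now ends in `br label %entry.split`; the builder keeps
//   // inserting before that branch, and Cont ("%entry.split") received the
//   // instructions that followed the old insertion point.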

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
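
// Editor's note, for illustration only (not part of the original source):
// the flags combine as a bitmask. For example, a translation unit containing
//   #pragma omp requires reverse_offload unified_shared_memory
// is represented as OMP_REQ_REVERSE_OFFLOAD | OMP_REQ_UNIFIED_SHARED_MEMORY,
// i.e. 0x002 | 0x008 = 0x00A.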

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}
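
// Editor's note, for illustration only (not part of the original source):
// the 64-bit Flags word built above packs HasNoWait into bit 0 and the
// dynamic cgroup-memory fallback kind into bits 2 and up; e.g. HasNoWait =
// true combined with a fallback value of 1 gives 0x1 | (0x1 << 2) = 0x5.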

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
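
// Editor's note, for illustration only (not part of the original source):
// the callback annotation above corresponds to IR roughly of the form
//
//   declare !callback !1 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !1 = !{!2}
//   !2 = !{i64 2, i64 -1, i64 -1, i1 true}
//
// i.e. the callee is operand 2, no arguments are known to be forwarded to
// its first two parameters, and variadic arguments are passed through.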

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward the target-cpu and target-features attributes to the outlined
    // function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry
  // block of our target function, or we risk malformed optimisations by
  // later passes. This is only relevant for the device pass, which appears
  // to be a little more delicate when it comes to optimisations (however,
  // we do not block on that here; it is up to the inserter to the list to
  // do so). This notably has to occur after the OutlinedInfo candidates
  // have been extracted, so we have an end product that will not be
  // implicitly adversely affected by any raises unless intentionally
  // appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be
  // folded when they get here. Extending it to runtime-defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads, otherwise it will yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
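
// Editor's note, for illustration only (not part of the original source):
// for a hypothetical call getOrCreateSrcLocStr("foo", "file.c", 3, 7, Sz)
// the buffer built above is ";file.c;foo;3;7;;", the layout libomp expects
// in an ident_t source-location string.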

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
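
// Editor's note, for illustration only (not part of the original source):
// outside a cancellable region the code above lowers to a plain barrier:
//
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)
//
// In a cancellable parallel region __kmpc_cancel_barrier is called instead,
// and its i32 result feeds the cancellation check.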

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
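
// Editor's note, for illustration only (not part of the original source):
// the launch sequence above expands to control flow of roughly this shape,
// with the host fallback emitted by EmitTargetCallFallbackCB:
//
//   %ret = call i32 @__tgt_target_kernel(...)
//   %failed = icmp ne i32 %ret, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
// omp_offload.failed:
//   ; host fallback code
//   br label %omp_offload.cont
// omp_offload.cont:
//   ; execution continues here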

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
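
// Editor's note, for illustration only (not part of the original source):
// for a cancel flag %flag the check generated above looks like
//
//   %cmp = icmp eq i32 %flag, 0
//   br i1 %cmp, label %bb.cont, label %bb.cncl
// bb.cncl:
//   ; finalization code, then branch towards the region exit
// bb.cont:
//   ; normal code generation continues here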

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store the captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
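
// Editor's note, for illustration only (not part of the original source):
// with two captured pointer variables and no if-clause, the code above
// emits, roughly,
//
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @loc, i32 2, ptr @outlined.omp_par, ptr %a, ptr %b)
//
// and the runtime invokes the microtask once per thread in the team,
// prepending the global and bound thread-id arguments.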
1498
1499OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1500 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1501 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1502 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1503 omp::ProcBindKind ProcBind, bool IsCancellable) {
1504 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1505
1506 if (!updateToLocation(Loc))
1507 return Loc.IP;
1508
1509 uint32_t SrcLocStrSize;
1510 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1511 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1512 Value *ThreadID = getOrCreateThreadID(Ident);
1513 // If we generate code for the target device, we need to allocate the
1514 // struct for aggregate params in the device default alloca address space.
1515 // The OpenMP runtime requires that the params of the extracted functions are
1516 // passed as zero address space pointers. This flag ensures that extracted
1517 // function arguments are declared in the zero address space.
1518 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1519
1520 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1521 // only if we compile for the host side.
1522 if (NumThreads && !Config.isTargetDevice()) {
1523 Value *Args[] = {
1524 Ident, ThreadID,
1525 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1526 Builder.CreateCall(
1527 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1528 }
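// As a hedged sketch (illustrative values, not verbatim output),
// `num_threads(4)` on the host would lower to something like
//   call void @__kmpc_push_num_threads(ptr @ident, i32 %gtid, i32 4)
// emitted just before the fork call.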
1529
1530 if (ProcBind != OMP_PROC_BIND_default) {
1531 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1532 Value *Args[] = {
1533 Ident, ThreadID,
1534 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1535 Builder.CreateCall(
1536 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1537 }
1538
1539 BasicBlock *InsertBB = Builder.GetInsertBlock();
1540 Function *OuterFn = InsertBB->getParent();
1541
1542 // Save the outer alloca block because the insertion iterator may get
1543 // invalidated and we still need this later.
1544 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1545
1546 // Vector to remember instructions we used only during the modeling but which
1547 // we want to delete at the end.
1548 SmallVector<Instruction *, 4> ToBeDeleted;
1549
1550 // Change the location to the outer alloca insertion point to create and
1551 // initialize the allocas we pass into the parallel region.
1552 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1553 Builder.restoreIP(NewOuter);
1554 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1555 AllocaInst *ZeroAddrAlloca =
1556 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1557 Instruction *TIDAddr = TIDAddrAlloca;
1558 Instruction *ZeroAddr = ZeroAddrAlloca;
1559 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1560 // Add additional casts to enforce pointers in zero address space
1561 TIDAddr = new AddrSpaceCastInst(
1562 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1563 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1564 ToBeDeleted.push_back(TIDAddr);
1565 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1566 PointerType::get(M.getContext(), 0),
1567 "zero.addr.ascast");
1568 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1569 ToBeDeleted.push_back(ZeroAddr);
1570 }
1571
1572 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1573 // associated arguments in the outlined function, so we delete them later.
1574 ToBeDeleted.push_back(TIDAddrAlloca);
1575 ToBeDeleted.push_back(ZeroAddrAlloca);
1576
1577 // Create an artificial insertion point that will also ensure the blocks we
1578 // are about to split are not degenerated.
1579 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1580
1581 BasicBlock *EntryBB = UI->getParent();
1582 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1583 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1584 BasicBlock *PRegPreFiniBB =
1585 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1586 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1587
1588 auto FiniCBWrapper = [&](InsertPointTy IP) {
1589 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1590 // target to the region exit block.
1591 if (IP.getBlock()->end() == IP.getPoint()) {
1592 IRBuilder<>::InsertPointGuard IPG(Builder);
1593 Builder.restoreIP(IP);
1594 Instruction *I = Builder.CreateBr(PRegExitBB);
1595 IP = InsertPointTy(I->getParent(), I->getIterator());
1596 }
1597 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1598 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1599 "Unexpected insertion point for finalization call!");
1600 return FiniCB(IP);
1601 };
1602
1603 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1604
1605 // Generate the privatization allocas in the block that will become the entry
1606 // of the outlined function.
1607 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1608 InsertPointTy InnerAllocaIP = Builder.saveIP();
1609
1610 AllocaInst *PrivTIDAddr =
1611 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1612 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1613
1614 // Add some fake uses for OpenMP provided arguments.
1615 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1616 Instruction *ZeroAddrUse =
1617 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1618 ToBeDeleted.push_back(ZeroAddrUse);
1619
1620 // EntryBB
1621 // |
1622 // V
1623 // PRegionEntryBB <- Privatization allocas are placed here.
1624 // |
1625 // V
1626 // PRegionBodyBB <- BodyGen is invoked here.
1627 // |
1628 // V
1629 // PRegPreFiniBB <- The block we will start finalization from.
1630 // |
1631 // V
1632 // PRegionExitBB <- A common exit to simplify block collection.
1633 //
1634
1635 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1636
1637 // Let the caller create the body.
1638 assert(BodyGenCB && "Expected body generation callback!");
1639 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1640 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1641 return Err;
1642
1643 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1644
1645 OutlineInfo OI;
1646 if (Config.isTargetDevice()) {
1647 // Generate OpenMP target specific runtime call
1648 OI.PostOutlineCB = [=, ToBeDeletedVec =
1649 std::move(ToBeDeleted)](Function &OutlinedFn) {
1650 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1651 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1652 ThreadID, ToBeDeletedVec);
1653 };
1654 } else {
1655 // Generate OpenMP host runtime call
1656 OI.PostOutlineCB = [=, ToBeDeletedVec =
1657 std::move(ToBeDeleted)](Function &OutlinedFn) {
1658 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1659 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1660 };
1661 }
1662
1663 OI.OuterAllocaBB = OuterAllocaBlock;
1664 OI.EntryBB = PRegEntryBB;
1665 OI.ExitBB = PRegExitBB;
1666
1667 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1668 SmallVector<BasicBlock *, 32> Blocks;
1669 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1670
1671 CodeExtractorAnalysisCache CEAC(*OuterFn);
1672 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1673 /* AggregateArgs */ false,
1674 /* BlockFrequencyInfo */ nullptr,
1675 /* BranchProbabilityInfo */ nullptr,
1676 /* AssumptionCache */ nullptr,
1677 /* AllowVarArgs */ true,
1678 /* AllowAlloca */ true,
1679 /* AllocationBlock */ OuterAllocaBlock,
1680 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1681
1682 // Find inputs to, outputs from the code region.
1683 BasicBlock *CommonExit = nullptr;
1684 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1685 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1686
1687 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1688 /*CollectGlobalInputs=*/true);
1689
1690 Inputs.remove_if([&](Value *I) {
1691 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1692 return GV->getValueType() == OpenMPIRBuilder::Ident;
1693
1694 return false;
1695 });
1696
1697 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1698
1699 FunctionCallee TIDRTLFn =
1700 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1701
1702 auto PrivHelper = [&](Value &V) -> Error {
1703 if (&V == TIDAddr || &V == ZeroAddr) {
1704 OI.ExcludeArgsFromAggregate.push_back(&V);
1705 return Error::success();
1706 }
1707
1708 SetVector<Use *> Uses;
1709 for (Use &U : V.uses())
1710 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1711 if (ParallelRegionBlockSet.count(UserI->getParent()))
1712 Uses.insert(&U);
1713
1714 // __kmpc_fork_call expects extra arguments as pointers. If the input
1715 // already has a pointer type, everything is fine. Otherwise, store the
1716 // value onto stack and load it back inside the to-be-outlined region. This
1717 // will ensure only the pointer will be passed to the function.
1718 // FIXME: if there are more than 15 trailing arguments, they must be
1719 // additionally packed in a struct.
1720 Value *Inner = &V;
1721 if (!V.getType()->isPointerTy()) {
1722 IRBuilder<>::InsertPointGuard Guard(Builder);
1723 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1724
1725 Builder.restoreIP(OuterAllocaIP);
1726 Value *Ptr =
1727 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1728
1729 // Store to stack at end of the block that currently branches to the entry
1730 // block of the to-be-outlined region.
1731 Builder.SetInsertPoint(InsertBB,
1732 InsertBB->getTerminator()->getIterator());
1733 Builder.CreateStore(&V, Ptr);
1734
1735 // Load back next to allocations in the to-be-outlined region.
1736 Builder.restoreIP(InnerAllocaIP);
1737 Inner = Builder.CreateLoad(V.getType(), Ptr);
1738 }
1739
1740 Value *ReplacementValue = nullptr;
1741 CallInst *CI = dyn_cast<CallInst>(&V);
1742 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1743 ReplacementValue = PrivTID;
1744 } else {
1745 InsertPointOrErrorTy AfterIP =
1746 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1747 if (!AfterIP)
1748 return AfterIP.takeError();
1749 Builder.restoreIP(*AfterIP);
1750 InnerAllocaIP = {
1751 InnerAllocaIP.getBlock(),
1752 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1753
1754 assert(ReplacementValue &&
1755 "Expected copy/create callback to set replacement value!");
1756 if (ReplacementValue == &V)
1757 return Error::success();
1758 }
1759
1760 for (Use *UPtr : Uses)
1761 UPtr->set(ReplacementValue);
1762
1763 return Error::success();
1764 };
1765
1766 // Reset the inner alloca insertion point as it will be used for loading the
1767 // values wrapped into pointers before passing them into the to-be-outlined region.
1768 // Configure it to insert immediately after the fake use of zero address so
1769 // that they are available in the generated body and so that the
1770 // OpenMP-related values (thread ID and zero address pointers) remain leading
1771 // in the argument list.
1772 InnerAllocaIP = IRBuilder<>::InsertPoint(
1773 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1774
1775 // Reset the outer alloca insertion point to the entry of the relevant block
1776 // in case it was invalidated.
1777 OuterAllocaIP = IRBuilder<>::InsertPoint(
1778 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1779
1780 for (Value *Input : Inputs) {
1781 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1782 if (Error Err = PrivHelper(*Input))
1783 return Err;
1784 }
1785 LLVM_DEBUG({
1786 for (Value *Output : Outputs)
1787 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1788 });
1789 assert(Outputs.empty() &&
1790 "OpenMP outlining should not produce live-out values!");
1791
1792 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1793 LLVM_DEBUG({
1794 for (auto *BB : Blocks)
1795 dbgs() << " PBR: " << BB->getName() << "\n";
1796 });
1797
1798 // Adjust the finalization stack, verify the adjustment, and call the
1799 // finalize function one last time to finalize values between the pre-fini
1800 // block and the exit block if we left the parallel "the normal way".
1801 auto FiniInfo = FinalizationStack.pop_back_val();
1802 (void)FiniInfo;
1803 assert(FiniInfo.DK == OMPD_parallel &&
1804 "Unexpected finalization stack state!");
1805
1806 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1807
1808 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1809 if (Error Err = FiniCB(PreFiniIP))
1810 return Err;
1811
1812 // Register the outlined info.
1813 addOutlineInfo(std::move(OI));
1814
1815 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1816 UI->eraseFromParent();
1817
1818 return AfterIP;
1819}
1820
1821void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1822 // Build call void __kmpc_flush(ident_t *loc)
1823 uint32_t SrcLocStrSize;
1824 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1825 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1826
1827 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1828}
1829
1830void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1831 if (!updateToLocation(Loc))
1832 return;
1833 emitFlush(Loc);
1834}
1835
1836void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1837 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1838 // global_tid);
1839 uint32_t SrcLocStrSize;
1840 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1841 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1842 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1843
1844 // Ignore return result until untied tasks are supported.
1845 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1846 Args);
1847}
1848
1849void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1850 if (!updateToLocation(Loc))
1851 return;
1852 emitTaskwaitImpl(Loc);
1853}
1854
1855void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1856 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1857 uint32_t SrcLocStrSize;
1858 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1859 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1860 Constant *I32Null = ConstantInt::getNullValue(Int32);
1861 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1862
1863 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1864 Args);
1865}
1866
1867void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1868 if (!updateToLocation(Loc))
1869 return;
1870 emitTaskyieldImpl(Loc);
1871}
1872
1873// Processes the dependencies in Dependencies and does the following
1874 // - Allocates space on the stack for an array of DependInfo objects.
1875 // - Populates each DependInfo object with the relevant information of
1876 // the corresponding dependence.
1877// - All code is inserted in the entry block of the current function.
1878 static Value *emitTaskDependencies(
1879 OpenMPIRBuilder &OMPBuilder,
1880 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1881 // Early return if we have no dependencies to process
1882 if (Dependencies.empty())
1883 return nullptr;
1884
1885 // Given a vector of DependData objects, in this function we create an
1886 // array on the stack that holds kmp_depend_info objects corresponding
1887 // to each dependency. This is then passed to the OpenMP runtime.
1888 // For example, if there are 'n' dependencies then the following pseudo
1889 // code is generated. Assume the first dependence is on a variable 'a'
1890 //
1891 // \code{c}
1892 // DepArray = alloc(n x sizeof(kmp_depend_info));
1893 // idx = 0;
1894 // DepArray[idx].base_addr = ptrtoint(&a);
1895 // DepArray[idx].len = 8;
1896 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1897 // ++idx;
1898 // DepArray[idx].base_addr = ...;
1899 // \endcode
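// Editorial note: the stores emitted below assume the runtime's
// kmp_depend_info layout, roughly { i64 base_addr, i64 len, i8 flags },
// addressed through the RTLDependInfoFields enum.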
1900
1901 IRBuilderBase &Builder = OMPBuilder.Builder;
1902 Type *DependInfo = OMPBuilder.DependInfo;
1903 Module &M = OMPBuilder.M;
1904
1905 Value *DepArray = nullptr;
1906 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1907 Builder.SetInsertPoint(
1908 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1909
1910 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1911 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1912
1913 Builder.restoreIP(OldIP);
1914
1915 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1916 Value *Base =
1917 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1918 // Store the pointer to the variable
1919 Value *Addr = Builder.CreateStructGEP(
1920 DependInfo, Base,
1921 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1922 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1923 Builder.CreateStore(DepValPtr, Addr);
1924 // Store the size of the variable
1925 Value *Size = Builder.CreateStructGEP(
1926 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1927 Builder.CreateStore(
1928 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1929 Size);
1930 // Store the dependency kind
1931 Value *Flags = Builder.CreateStructGEP(
1932 DependInfo, Base,
1933 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1934 Builder.CreateStore(
1935 ConstantInt::get(Builder.getInt8Ty(),
1936 static_cast<unsigned int>(Dep.DepKind)),
1937 Flags);
1938 }
1939 return DepArray;
1940}
1941
1942OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1943 const LocationDescription &Loc, InsertPointTy AllocaIP,
1944 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1945 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1946 Value *Priority) {
1947
1948 if (!updateToLocation(Loc))
1949 return InsertPointTy();
1950
1951 uint32_t SrcLocStrSize;
1952 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1953 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1954 // The current basic block is split into four basic blocks. After outlining,
1955 // they will be mapped as follows:
1956 // ```
1957 // def current_fn() {
1958 // current_basic_block:
1959 // br label %task.exit
1960 // task.exit:
1961 // ; instructions after task
1962 // }
1963 // def outlined_fn() {
1964 // task.alloca:
1965 // br label %task.body
1966 // task.body:
1967 // ret void
1968 // }
1969 // ```
1970 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1971 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1972 BasicBlock *TaskAllocaBB =
1973 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1974
1975 InsertPointTy TaskAllocaIP =
1976 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1977 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1978 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1979 return Err;
1980
1981 OutlineInfo OI;
1982 OI.EntryBB = TaskAllocaBB;
1983 OI.OuterAllocaBB = AllocaIP.getBlock();
1984 OI.ExitBB = TaskExitBB;
1985
1986 // Add the thread ID argument.
1987 SmallVector<Instruction *, 4> ToBeDeleted;
1988 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1989 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1990
1991 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1992 Mergeable, Priority, EventHandle, TaskAllocaBB,
1993 ToBeDeleted](Function &OutlinedFn) mutable {
1994 // Replace the stale CI with the appropriate RTL function call.
1995 assert(OutlinedFn.hasOneUse() &&
1996 "there must be a single user for the outlined function");
1997 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1998
1999 // HasShareds is true if any variables are captured in the outlined region,
2000 // false otherwise.
2001 bool HasShareds = StaleCI->arg_size() > 1;
2002 Builder.SetInsertPoint(StaleCI);
2003
2004 // Gather the arguments for emitting the runtime call for
2005 // @__kmpc_omp_task_alloc
2006 Function *TaskAllocFn =
2007 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2008
2009 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2010 // call.
2011 Value *ThreadID = getOrCreateThreadID(Ident);
2012
2013 // Argument - `flags`
2014 // Task is tied iff (Flags & 1) == 1.
2015 // Task is untied iff (Flags & 1) == 0.
2016 // Task is final iff (Flags & 2) == 2.
2017 // Task is not final iff (Flags & 2) == 0.
2018 // Task is mergeable iff (Flags & 4) == 4.
2019 // Task is not mergeable iff (Flags & 4) == 0.
2020 // Task is priority iff (Flags & 32) == 32.
2021 // Task is not priority iff (Flags & 32) == 0.
2022 // TODO: Handle the other flags.
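// For example (illustrative): a tied task that is also `mergeable` and has
// a `priority` clause ends up with Flags = 1 | 4 | 32 = 37.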
2023 Value *Flags = Builder.getInt32(Tied);
2024 if (Final) {
2025 Value *FinalFlag =
2026 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2027 Flags = Builder.CreateOr(FinalFlag, Flags);
2028 }
2029
2030 if (Mergeable)
2031 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2032 if (Priority)
2033 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2034
2035 // Argument - `sizeof_kmp_task_t` (TaskSize)
2036 // Tasksize refers to the size in bytes of kmp_task_t data structure
2037 // including private vars accessed in task.
2038 // TODO: add kmp_task_t_with_privates (privates)
2039 Value *TaskSize = Builder.getInt64(
2040 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2041
2042 // Argument - `sizeof_shareds` (SharedsSize)
2043 // SharedsSize refers to the shareds array size in the kmp_task_t data
2044 // structure.
2045 Value *SharedsSize = Builder.getInt64(0);
2046 if (HasShareds) {
2047 AllocaInst *ArgStructAlloca =
2048 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2049 assert(ArgStructAlloca &&
2050 "Unable to find the alloca instruction corresponding to arguments "
2051 "for extracted function");
2052 StructType *ArgStructType =
2053 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2054 assert(ArgStructType && "Unable to find struct type corresponding to "
2055 "arguments for extracted function");
2056 SharedsSize =
2057 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2058 }
2059 // Emit the @__kmpc_omp_task_alloc runtime call
2060 // The runtime call returns a pointer to an area where the task captured
2061 // variables must be copied before the task is run (TaskData)
2062 CallInst *TaskData = Builder.CreateCall(
2063 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2064 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2065 /*task_func=*/&OutlinedFn});
2066
2067 // Emit detach clause initialization.
2068 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2069 // task_descriptor);
2070 if (EventHandle) {
2071 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2072 OMPRTL___kmpc_task_allow_completion_event);
2073 llvm::Value *EventVal =
2074 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2075 llvm::Value *EventHandleAddr =
2076 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2077 Builder.getPtrTy(0));
2078 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2079 Builder.CreateStore(EventVal, EventHandleAddr);
2080 }
2081 // Copy the arguments for outlined function
2082 if (HasShareds) {
2083 Value *Shareds = StaleCI->getArgOperand(1);
2084 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2085 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2086 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2087 SharedsSize);
2088 }
2089
2090 if (Priority) {
2091 //
2092 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2093 // we populate the priority information into the "kmp_task_t" here
2094 //
2095 // The struct "kmp_task_t" definition is available in kmp.h
2096 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2097 // data2 is used for priority
2098 //
2099 Type *Int32Ty = Builder.getInt32Ty();
2100 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2101 // kmp_task_t* => { ptr }
2102 Type *TaskPtr = StructType::get(VoidPtr);
2103 Value *TaskGEP =
2104 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2105 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2106 Type *TaskStructType = StructType::get(
2107 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2108 Value *PriorityData = Builder.CreateInBoundsGEP(
2109 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2110 // kmp_cmplrdata_t => { ptr, ptr }
2111 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2112 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2113 PriorityData, {Zero, Zero});
2114 Builder.CreateStore(Priority, CmplrData);
2115 }
2116
2117 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2118
2119 // In the presence of the `if` clause, the following IR is generated:
2120 // ...
2121 // %data = call @__kmpc_omp_task_alloc(...)
2122 // br i1 %if_condition, label %then, label %else
2123 // then:
2124 // call @__kmpc_omp_task(...)
2125 // br label %exit
2126 // else:
2127 // ;; Wait for resolution of dependencies, if any, before
2128 // ;; beginning the task
2129 // call @__kmpc_omp_wait_deps(...)
2130 // call @__kmpc_omp_task_begin_if0(...)
2131 // call @outlined_fn(...)
2132 // call @__kmpc_omp_task_complete_if0(...)
2133 // br label %exit
2134 // exit:
2135 // ...
2136 if (IfCondition) {
2137 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2138 // terminator.
2139 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2140 Instruction *IfTerminator =
2141 Builder.GetInsertPoint()->getParent()->getTerminator();
2142 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2143 Builder.SetInsertPoint(IfTerminator);
2144 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2145 &ElseTI);
2146 Builder.SetInsertPoint(ElseTI);
2147
2148 if (Dependencies.size()) {
2149 Function *TaskWaitFn =
2150 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2151 Builder.CreateCall(
2152 TaskWaitFn,
2153 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2154 ConstantInt::get(Builder.getInt32Ty(), 0),
2155 ConstantInt::getNullValue(Builder.getPtrTy())});
2156 }
2157 Function *TaskBeginFn =
2158 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2159 Function *TaskCompleteFn =
2160 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2161 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2162 CallInst *CI = nullptr;
2163 if (HasShareds)
2164 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2165 else
2166 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2167 CI->setDebugLoc(StaleCI->getDebugLoc());
2168 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2169 Builder.SetInsertPoint(ThenTI);
2170 }
2171
2172 if (Dependencies.size()) {
2173 Function *TaskFn =
2174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2175 Builder.CreateCall(
2176 TaskFn,
2177 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2178 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2179 ConstantInt::getNullValue(Builder.getPtrTy())});
2180
2181 } else {
2182 // Emit the @__kmpc_omp_task runtime call to spawn the task
2183 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2184 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2185 }
2186
2187 StaleCI->eraseFromParent();
2188
2189 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2190 if (HasShareds) {
2191 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2192 OutlinedFn.getArg(1)->replaceUsesWithIf(
2193 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2194 }
2195
2196 for (Instruction *I : llvm::reverse(ToBeDeleted))
2197 I->eraseFromParent();
2198 };
2199
2200 addOutlineInfo(std::move(OI));
2201 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2202
2203 return Builder.saveIP();
2204}
2205
2206OpenMPIRBuilder::InsertPointOrErrorTy
2207OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2208 InsertPointTy AllocaIP,
2209 BodyGenCallbackTy BodyGenCB) {
2210 if (!updateToLocation(Loc))
2211 return InsertPointTy();
2212
2213 uint32_t SrcLocStrSize;
2214 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2215 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2216 Value *ThreadID = getOrCreateThreadID(Ident);
2217
2218 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2219 Function *TaskgroupFn =
2220 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2221 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2222
2223 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2224 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2225 return Err;
2226
2227 Builder.SetInsertPoint(TaskgroupExitBB);
2228 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2229 Function *EndTaskgroupFn =
2230 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2231 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2232
2233 return Builder.saveIP();
2234}
2235
2236OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2237 const LocationDescription &Loc, InsertPointTy AllocaIP,
2238 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2239 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2240 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2241
2242 if (!updateToLocation(Loc))
2243 return Loc.IP;
2244
2245 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2246 // this block may not yet have been created when this callback runs.
2247 SmallVector<BranchInst *> CancellationBranches;
2248 auto FiniCBWrapper = [&](InsertPointTy IP) {
2249 if (IP.getBlock()->end() != IP.getPoint())
2250 return FiniCB(IP);
2251 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2252 // will fail because that function requires the finalization basic block to
2253 // have a terminator, which is already removed by EmitOMPRegionBody.
2254 // IP is currently at the cancellation block.
2255 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2256 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2257 CancellationBranches.push_back(DummyBranch);
2258 return FiniCB(IP);
2259 };
2260
2261 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2262
2263 // Each section is emitted as a switch case
2264 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2265 // -> OMP.createSection() which generates the IR for each section
2266 // Iterate through all sections and emit a switch construct:
2267 // switch (IV) {
2268 // case 0:
2269 // <SectionStmt[0]>;
2270 // break;
2271 // ...
2272 // case <NumSection> - 1:
2273 // <SectionStmt[<NumSection> - 1]>;
2274 // break;
2275 // }
2276 // ...
2277 // section_loop.after:
2278 // <FiniCB>;
2279 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2280 Builder.restoreIP(CodeGenIP);
2281 BasicBlock *Continue =
2282 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2283 Function *CurFn = Continue->getParent();
2284 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2285
2286 unsigned CaseNumber = 0;
2287 for (auto SectionCB : SectionCBs) {
2289 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2290 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2291 Builder.SetInsertPoint(CaseBB);
2292 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2293 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2294 CaseEndBr->getIterator()}))
2295 return Err;
2296 CaseNumber++;
2297 }
2298 // remove the existing terminator from body BB since there can be no
2299 // terminators after switch/case
2300 return Error::success();
2301 };
2302 // Loop body ends here
2303 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2304 Type *I32Ty = Type::getInt32Ty(M.getContext());
2305 Value *LB = ConstantInt::get(I32Ty, 0);
2306 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2307 Value *ST = ConstantInt::get(I32Ty, 1);
2308 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2309 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2310 if (!LoopInfo)
2311 return LoopInfo.takeError();
2312
2313 InsertPointOrErrorTy WsloopIP =
2314 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2315 WorksharingLoopType::ForStaticLoop, !IsNowait);
2316 if (!WsloopIP)
2317 return WsloopIP.takeError();
2318 InsertPointTy AfterIP = *WsloopIP;
2319
2320 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2321 assert(LoopFini && "Bad structure of static workshare loop finalization");
2322
2323 // Apply the finalization callback in LoopAfterBB
2324 auto FiniInfo = FinalizationStack.pop_back_val();
2325 assert(FiniInfo.DK == OMPD_sections &&
2326 "Unexpected finalization stack state!");
2327 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2328 Builder.restoreIP(AfterIP);
2329 BasicBlock *FiniBB =
2330 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2331 if (Error Err = CB(Builder.saveIP()))
2332 return Err;
2333 AfterIP = {FiniBB, FiniBB->begin()};
2334 }
2335
2336 // Now we can fix the dummy branch to point to the right place
2337 for (BranchInst *DummyBranch : CancellationBranches) {
2338 assert(DummyBranch->getNumSuccessors() == 1);
2339 DummyBranch->setSuccessor(0, LoopFini);
2340 }
2341
2342 return AfterIP;
2343}
2344
2345OpenMPIRBuilder::InsertPointOrErrorTy
2346OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2347 BodyGenCallbackTy BodyGenCB,
2348 FinalizeCallbackTy FiniCB) {
2349 if (!updateToLocation(Loc))
2350 return Loc.IP;
2351
2352 auto FiniCBWrapper = [&](InsertPointTy IP) {
2353 if (IP.getBlock()->end() != IP.getPoint())
2354 return FiniCB(IP);
2355 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2356 // will fail because that function requires the finalization basic block to
2357 // have a terminator, which is already removed by EmitOMPRegionBody.
2358 // IP is currently at the cancellation block.
2359 // We need to backtrack to the condition block to fetch
2360 // the exit block and create a branch from cancelation
2361 // to exit block.
2362 IRBuilder<>::InsertPointGuard IPG(Builder);
2363 Builder.restoreIP(IP);
2364 auto *CaseBB = Loc.IP.getBlock();
2365 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2366 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2367 Instruction *I = Builder.CreateBr(ExitBB);
2368 IP = InsertPointTy(I->getParent(), I->getIterator());
2369 return FiniCB(IP);
2370 };
2371
2372 Directive OMPD = Directive::OMPD_sections;
2373 // Since we are using Finalization Callback here, HasFinalize
2374 // and IsCancellable have to be true
2375 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2376 /*Conditional*/ false, /*hasFinalize*/ true,
2377 /*IsCancellable*/ true);
2378}
2379
2380static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2381 BasicBlock::iterator IT(I);
2382 IT++;
2383 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2384}
2385
2386Value *OpenMPIRBuilder::getGPUThreadID() {
2387 return Builder.CreateCall(
2388 getOrCreateRuntimeFunction(M,
2389 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2390 {});
2391}
2392
2393Value *OpenMPIRBuilder::getGPUWarpSize() {
2394 return Builder.CreateCall(
2395 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2396}
2397
2398Value *OpenMPIRBuilder::getNVPTXWarpID() {
2399 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
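// Illustrative: with the common warp size of 32, LaneIDBits is 5, so the
// warp id is simply tid >> 5.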
2400 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2401}
2402
2403Value *OpenMPIRBuilder::getNVPTXLaneID() {
2404 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2405 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2406 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
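// Illustrative: for a warp size of 32, LaneIDMask is 0x1f, so the lane id
// is tid & 31.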
2407 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2408 "nvptx_lane_id");
2409}
2410
2411Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2412 Type *ToType) {
2413 Type *FromType = From->getType();
2414 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2415 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2416 assert(FromSize > 0 && "From size must be greater than zero");
2417 assert(ToSize > 0 && "To size must be greater than zero");
2418 if (FromType == ToType)
2419 return From;
2420 if (FromSize == ToSize)
2421 return Builder.CreateBitCast(From, ToType);
2422 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2423 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
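// Illustrative examples: float -> i32 (equal 4-byte sizes) takes the
// bitcast path above; i16 -> i64 takes the signed int-cast path; anything
// else (e.g. float -> double) is routed through the stack slot created below.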
2424 InsertPointTy SaveIP = Builder.saveIP();
2425 Builder.restoreIP(AllocaIP);
2426 Value *CastItem = Builder.CreateAlloca(ToType);
2427 Builder.restoreIP(SaveIP);
2428
2429 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2430 CastItem, Builder.getPtrTy(0));
2431 Builder.CreateStore(From, ValCastItem);
2432 return Builder.CreateLoad(ToType, CastItem);
2433}
2434
2435Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2436 Value *Element,
2437 Type *ElementType,
2438 Value *Offset) {
2439 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2440 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2441
2442 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2443 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2444 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2445 Value *WarpSize =
2446 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2447 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2448 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2449 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
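// Illustrative: a 2-byte element is widened to i32 and moved with
// __kmpc_shuffle_int32; an 8-byte element uses __kmpc_shuffle_int64.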
2450 Value *WarpSizeCast =
2451 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2452 Value *ShuffleCall =
2453 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2454 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2455}
2456
2457void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2458 Value *DstAddr, Type *ElemType,
2459 Value *Offset, Type *ReductionArrayTy) {
2460 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2461 // Create the loop over the big sized data.
2462 // ptr = (void*)Elem;
2463 // ptrEnd = (void*) Elem + 1;
2464 // Step = 8;
2465 // while (ptr + Step < ptrEnd)
2466 // shuffle((int64_t)*ptr);
2467 // Step = 4;
2468 // while (ptr + Step < ptrEnd)
2469 // shuffle((int32_t)*ptr);
2470 // ...
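// For example (illustrative): a 6-byte element is transferred as one i32
// shuffle followed by one i16 shuffle (the i64 step is skipped, then
// 6 % 4 == 2 bytes remain for the i16 step).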
2471 Type *IndexTy = Builder.getIndexTy(
2472 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2473 Value *ElemPtr = DstAddr;
2474 Value *Ptr = SrcAddr;
2475 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2476 if (Size < IntSize)
2477 continue;
2478 Type *IntType = Builder.getIntNTy(IntSize * 8);
2479 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2480 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2481 Value *SrcAddrGEP =
2482 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2483 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2484 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2485
2486 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2487 if ((Size / IntSize) > 1) {
2488 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2489 SrcAddrGEP, Builder.getPtrTy());
2490 BasicBlock *PreCondBB =
2491 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2492 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2493 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2494 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2495 emitBlock(PreCondBB, CurFunc);
2496 PHINode *PhiSrc =
2497 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2498 PhiSrc->addIncoming(Ptr, CurrentBB);
2499 PHINode *PhiDest =
2500 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2501 PhiDest->addIncoming(ElemPtr, CurrentBB);
2502 Ptr = PhiSrc;
2503 ElemPtr = PhiDest;
2504 Value *PtrDiff = Builder.CreatePtrDiff(
2505 Builder.getInt8Ty(), PtrEnd,
2506 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2507 Builder.CreateCondBr(
2508 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2509 ExitBB);
2510 emitBlock(ThenBB, CurFunc);
2511 Value *Res = createRuntimeShuffleFunction(
2512 AllocaIP,
2513 Builder.CreateAlignedLoad(
2514 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2515 IntType, Offset);
2516 Builder.CreateAlignedStore(Res, ElemPtr,
2517 M.getDataLayout().getPrefTypeAlign(ElemType));
2518 Value *LocalPtr =
2519 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2520 Value *LocalElemPtr =
2521 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2522 PhiSrc->addIncoming(LocalPtr, ThenBB);
2523 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2524 emitBranch(PreCondBB);
2525 emitBlock(ExitBB, CurFunc);
2526 } else {
2527 Value *Res = createRuntimeShuffleFunction(
2528 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2529 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2530 Res->getType()->getScalarSizeInBits())
2531 Res = Builder.CreateTrunc(Res, ElemType);
2532 Builder.CreateStore(Res, ElemPtr);
2533 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2534 ElemPtr =
2535 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2536 }
2537 Size = Size % IntSize;
2538 }
2539}
2540
2541void OpenMPIRBuilder::emitReductionListCopy(
2542 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2543 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2544 CopyOptionsTy CopyOptions) {
2545 Type *IndexTy = Builder.getIndexTy(
2546 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2547 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2548
2549 // Iterate, element by element, through the source Reduce list and
2550 // make a copy.
2551 for (auto En : enumerate(ReductionInfos)) {
2552 const ReductionInfo &RI = En.value();
2553 Value *SrcElementAddr = nullptr;
2554 Value *DestElementAddr = nullptr;
2555 Value *DestElementPtrAddr = nullptr;
2556 // Should we shuffle in an element from a remote lane?
2557 bool ShuffleInElement = false;
2558 // Set to true to update the pointer in the dest Reduce list to a
2559 // newly created element.
2560 bool UpdateDestListPtr = false;
2561
2562 // Step 1.1: Get the address for the src element in the Reduce list.
2563 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2564 ReductionArrayTy, SrcBase,
2565 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2566 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2567
2568 // Step 1.2: Create a temporary to store the element in the destination
2569 // Reduce list.
2570 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2571 ReductionArrayTy, DestBase,
2572 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2573 switch (Action) {
2574 case CopyAction::RemoteLaneToThread: {
2575 InsertPointTy CurIP = Builder.saveIP();
2576 Builder.restoreIP(AllocaIP);
2577 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2578 ".omp.reduction.element");
2579 DestAlloca->setAlignment(
2580 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2581 DestElementAddr = DestAlloca;
2582 DestElementAddr =
2583 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2584 DestElementAddr->getName() + ".ascast");
2585 Builder.restoreIP(CurIP);
2586 ShuffleInElement = true;
2587 UpdateDestListPtr = true;
2588 break;
2589 }
2590 case CopyAction::ThreadCopy: {
2591 DestElementAddr =
2592 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2593 break;
2594 }
2595 }
2596
2597 // Now that all active lanes have read the element in the
2598 // Reduce list, shuffle over the value from the remote lane.
2599 if (ShuffleInElement) {
2600 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2601 RemoteLaneOffset, ReductionArrayTy);
2602 } else {
2603 switch (RI.EvaluationKind) {
2604 case EvalKind::Scalar: {
2605 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2606 // Store the source element value to the dest element address.
2607 Builder.CreateStore(Elem, DestElementAddr);
2608 break;
2609 }
2610 case EvalKind::Complex: {
2611 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2612 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2613 Value *SrcReal = Builder.CreateLoad(
2614 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2615 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2616 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2617 Value *SrcImg = Builder.CreateLoad(
2618 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2619
2620 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2621 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2622 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2623 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2624 Builder.CreateStore(SrcReal, DestRealPtr);
2625 Builder.CreateStore(SrcImg, DestImgPtr);
2626 break;
2627 }
2628 case EvalKind::Aggregate: {
2629 Value *SizeVal = Builder.getInt64(
2630 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2631 Builder.CreateMemCpy(
2632 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2633 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2634 SizeVal, false);
2635 break;
2636 }
2637 };
2638 }
2639
2640 // Step 3.1: Modify reference in dest Reduce list as needed.
2641 // Modifying the reference in Reduce list to point to the newly
2642 // created element. The element is live in the current function
2643 // scope and that of functions it invokes (i.e., reduce_function).
2644 // RemoteReduceData[i] = (void*)&RemoteElem
2645 if (UpdateDestListPtr) {
2646 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2647 DestElementAddr, Builder.getPtrTy(),
2648 DestElementAddr->getName() + ".ascast");
2649 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2650 }
2651 }
2652}
2653
2654Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2655 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2656 AttributeList FuncAttrs) {
2657 InsertPointTy SavedIP = Builder.saveIP();
2658 LLVMContext &Ctx = M.getContext();
2659 FunctionType *FuncTy = FunctionType::get(
2660 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2661 /* IsVarArg */ false);
2662 Function *WcFunc =
2664 "_omp_reduction_inter_warp_copy_func", &M);
2665 WcFunc->setAttributes(FuncAttrs);
2666 WcFunc->addParamAttr(0, Attribute::NoUndef);
2667 WcFunc->addParamAttr(1, Attribute::NoUndef);
2668 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2669 Builder.SetInsertPoint(EntryBB);
2670
2671 // ReduceList: thread local Reduce list.
2672 // At the stage of the computation when this function is called, partially
2673 // aggregated values reside in the first lane of every active warp.
2674 Argument *ReduceListArg = WcFunc->getArg(0);
2675 // NumWarps: number of warps active in the parallel region. This could
2676 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2677 Argument *NumWarpsArg = WcFunc->getArg(1);
2678
2679 // This array is used as a medium to transfer, one reduce element at a time,
2680 // the data from the first lane of every warp to lanes in the first warp
2681 // in order to perform the final step of a reduction in a parallel region
2682 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2683 // for reduced latency, as well as to have a distinct copy for concurrently
2684 // executing target regions. The array is declared with common linkage so
2685 // as to be shared across compilation units.
2686 StringRef TransferMediumName =
2687 "__openmp_nvptx_data_transfer_temporary_storage";
2688 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2689 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2690 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2691 if (!TransferMedium) {
2692 TransferMedium = new GlobalVariable(
2693 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2694 UndefValue::get(ArrayTy), TransferMediumName,
2695 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2696 /*AddressSpace=*/3);
2697 }
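// (Address space 3 is the GPU shared/LDS address space, so each thread
// block gets its own copy of this transfer medium.)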
2698
2699 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2700 Value *GPUThreadID = getGPUThreadID();
2701 // nvptx_lane_id = nvptx_id % warpsize
2702 Value *LaneID = getNVPTXLaneID();
2703 // nvptx_warp_id = nvptx_id / warpsize
2704 Value *WarpID = getNVPTXWarpID();
2705
2706 InsertPointTy AllocaIP =
2707 InsertPointTy(Builder.GetInsertBlock(),
2708 Builder.GetInsertBlock()->getFirstInsertionPt());
2709 Type *Arg0Type = ReduceListArg->getType();
2710 Type *Arg1Type = NumWarpsArg->getType();
2711 Builder.restoreIP(AllocaIP);
2712 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2713 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2714 AllocaInst *NumWarpsAlloca =
2715 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2716 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2717 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2718 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2719 NumWarpsAlloca, Builder.getPtrTy(0),
2720 NumWarpsAlloca->getName() + ".ascast");
2721 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2722 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2723 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2724 InsertPointTy CodeGenIP =
2725 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2726 Builder.restoreIP(CodeGenIP);
2727
2728 Value *ReduceList =
2729 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2730
2731 for (auto En : enumerate(ReductionInfos)) {
2732 //
2733 // Warp master copies reduce element to transfer medium in __shared__
2734 // memory.
2735 //
2736 const ReductionInfo &RI = En.value();
2737 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2738 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2739 Type *CType = Builder.getIntNTy(TySize * 8);
2740
2741 unsigned NumIters = RealTySize / TySize;
2742 if (NumIters == 0)
2743 continue;
2744 Value *Cnt = nullptr;
2745 Value *CntAddr = nullptr;
2746 BasicBlock *PrecondBB = nullptr;
2747 BasicBlock *ExitBB = nullptr;
2748 if (NumIters > 1) {
2749 CodeGenIP = Builder.saveIP();
2750 Builder.restoreIP(AllocaIP);
2751 CntAddr =
2752 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2753
2754 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2755 CntAddr->getName() + ".ascast");
2756 Builder.restoreIP(CodeGenIP);
2757 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2758 CntAddr,
2759 /*Volatile=*/false);
2760 PrecondBB = BasicBlock::Create(Ctx, "precond");
2761 ExitBB = BasicBlock::Create(Ctx, "exit");
2762 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2763 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2764 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2765 /*Volatile=*/false);
2766 Value *Cmp = Builder.CreateICmpULT(
2767 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2768 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2769 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2770 }
2771
2772 // kmpc_barrier.
2773 InsertPointOrErrorTy BarrierIP1 =
2774 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2775 omp::Directive::OMPD_unknown,
2776 /* ForceSimpleCall */ false,
2777 /* CheckCancelFlag */ true);
2778 if (!BarrierIP1)
2779 return BarrierIP1.takeError();
2780 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2781 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2782 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2783
2784 // if (lane_id == 0)
2785 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2786 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2787 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2788
2789 // Reduce element = LocalReduceList[i]
2790 auto *RedListArrayTy =
2791 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2792 Type *IndexTy = Builder.getIndexTy(
2793 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2794 Value *ElemPtrPtr =
2795 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2796 {ConstantInt::get(IndexTy, 0),
2797 ConstantInt::get(IndexTy, En.index())});
2798 // elemptr = ((CopyType*)(elemptrptr)) + I
2799 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2800 if (NumIters > 1)
2801 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2802
2803 // Get pointer to location in transfer medium.
2804 // MediumPtr = &medium[warp_id]
2805 Value *MediumPtr = Builder.CreateInBoundsGEP(
2806 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2807 // elem = *elemptr
2808 //*MediumPtr = elem
2809 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2810 // Store the source element value to the dest element address.
2811 Builder.CreateStore(Elem, MediumPtr,
2812 /*IsVolatile*/ true);
2813 Builder.CreateBr(MergeBB);
2814
2815 // else
2816 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2817 Builder.CreateBr(MergeBB);
2818
2819 // endif
2820 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2821 InsertPointOrErrorTy BarrierIP2 =
2822 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2823 omp::Directive::OMPD_unknown,
2824 /* ForceSimpleCall */ false,
2825 /* CheckCancelFlag */ true);
2826 if (!BarrierIP2)
2827 return BarrierIP2.takeError();
2828
2829 // Warp 0 copies reduce element from transfer medium
2830 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2831 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2832 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2833
2834 Value *NumWarpsVal =
2835 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2836 // Up to 32 threads in warp 0 are active.
2837 Value *IsActiveThread =
2838 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2839 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2840
2841 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2842
2843 // SrcMediumPtr = &medium[tid]
2844 // SrcMediumVal = *SrcMediumPtr
2845 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2846 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2847 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2848 Value *TargetElemPtrPtr =
2849 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2850 {ConstantInt::get(IndexTy, 0),
2851 ConstantInt::get(IndexTy, En.index())});
2852 Value *TargetElemPtrVal =
2853 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2854 Value *TargetElemPtr = TargetElemPtrVal;
2855 if (NumIters > 1)
2856 TargetElemPtr =
2857 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2858
2859 // *TargetElemPtr = SrcMediumVal;
2860 Value *SrcMediumValue =
2861 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2862 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2863 Builder.CreateBr(W0MergeBB);
2864
2865 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2866 Builder.CreateBr(W0MergeBB);
2867
2868 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2869
2870 if (NumIters > 1) {
2871 Cnt = Builder.CreateNSWAdd(
2872 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2873 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2874
2875 auto *CurFn = Builder.GetInsertBlock()->getParent();
2876 emitBranch(PrecondBB);
2877 emitBlock(ExitBB, CurFn);
2878 }
2879 RealTySize %= TySize;
2880 }
2881 }
2882
2883 Builder.CreateRetVoid();
2884 Builder.restoreIP(SavedIP);
2885
2886 return WcFunc;
2887}
2888
2889Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2890 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2891 AttributeList FuncAttrs) {
2892 LLVMContext &Ctx = M.getContext();
2893 FunctionType *FuncTy =
2894 FunctionType::get(Builder.getVoidTy(),
2895 {Builder.getPtrTy(), Builder.getInt16Ty(),
2896 Builder.getInt16Ty(), Builder.getInt16Ty()},
2897 /* IsVarArg */ false);
2898 Function *SarFunc =
2900 "_omp_reduction_shuffle_and_reduce_func", &M);
2901 SarFunc->setAttributes(FuncAttrs);
2902 SarFunc->addParamAttr(0, Attribute::NoUndef);
2903 SarFunc->addParamAttr(1, Attribute::NoUndef);
2904 SarFunc->addParamAttr(2, Attribute::NoUndef);
2905 SarFunc->addParamAttr(3, Attribute::NoUndef);
2906 SarFunc->addParamAttr(1, Attribute::SExt);
2907 SarFunc->addParamAttr(2, Attribute::SExt);
2908 SarFunc->addParamAttr(3, Attribute::SExt);
2909 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2910 Builder.SetInsertPoint(EntryBB);
2911
2912 // Thread local Reduce list used to host the values of data to be reduced.
2913 Argument *ReduceListArg = SarFunc->getArg(0);
2914 // Current lane id; could be logical.
2915 Argument *LaneIDArg = SarFunc->getArg(1);
2916 // Offset of the remote source lane relative to the current lane.
2917 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2918 // Algorithm version. This is expected to be known at compile time.
2919 Argument *AlgoVerArg = SarFunc->getArg(3);
2920
2921 Type *ReduceListArgType = ReduceListArg->getType();
2922 Type *LaneIDArgType = LaneIDArg->getType();
2923 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2924 Value *ReduceListAlloca = Builder.CreateAlloca(
2925 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2926 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2927 LaneIDArg->getName() + ".addr");
2928 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2929 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2930 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2931 AlgoVerArg->getName() + ".addr");
2932 ArrayType *RedListArrayTy =
2933 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2934
2935 // Create a local thread-private variable to host the Reduce list
2936 // from a remote lane.
2937 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2938 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2939
2940 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2941 ReduceListAlloca, ReduceListArgType,
2942 ReduceListAlloca->getName() + ".ascast");
2943 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2944 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2945 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2946 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2947 RemoteLaneOffsetAlloca->getName() + ".ascast");
2948 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2949 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2950 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2951 RemoteReductionListAlloca, Builder.getPtrTy(),
2952 RemoteReductionListAlloca->getName() + ".ascast");
2953
2954 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2955 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2956 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2957 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2958
2959 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2960 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2961 Value *RemoteLaneOffset =
2962 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2963 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2964
2965 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2966
2967 // This loop iterates through the list of reduce elements and copies,
2968 // element by element, from a remote lane in the warp to RemoteReduceList,
2969 // hosted on the thread's stack.
2970 emitReductionListCopy(
2971 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2972 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2973
2974 // The actions to be performed on the Remote Reduce list are dependent
2975 // on the algorithm version.
2976 //
2977 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2978 // LaneId % 2 == 0 && Offset > 0):
2979 // do the reduction value aggregation
2980 //
2981 // The thread local variable Reduce list is mutated in place to host the
2982 // reduced data, which is the aggregated value produced from local and
2983 // remote lanes.
2984 //
2985 // Note that AlgoVer is expected to be a constant integer known at compile
2986 // time.
2987 // When AlgoVer==0, the first conjunction evaluates to true, making
2988 // the entire predicate true at compile time.
2989 // When AlgoVer==1, only the second part of the second conjunction needs
2990 // to be evaluated at runtime; the other conjunctions evaluate to false
2991 // at compile time.
2992 // When AlgoVer==2, only the second part of the third conjunction needs
2993 // to be evaluated at runtime; the other conjunctions evaluate to false
2994 // at compile time.
2995 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2996 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2997 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2998 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2999 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3000 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3001 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3002 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3003 Value *RemoteOffsetComp =
3004 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3005 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3006 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3007 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3008
3009 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3010 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3011 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3012
3013 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3014 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3015 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3016 ReduceList, Builder.getPtrTy());
3017 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3018 RemoteListAddrCast, Builder.getPtrTy());
3019 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3020 ->addFnAttr(Attribute::NoUnwind);
3021 Builder.CreateBr(MergeBB);
3022
3023 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3024 Builder.CreateBr(MergeBB);
3025
3026 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3027
3028 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3029 // Reduce list.
3030 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3031 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3032 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3033
3034 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3035 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3036 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3037 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3038
3039 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3040 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3041 ReductionInfos, RemoteListAddrCast, ReduceList);
3042 Builder.CreateBr(CpyMergeBB);
3043
3044 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3045 Builder.CreateBr(CpyMergeBB);
3046
3047 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3048
3049 Builder.CreateRetVoid();
3050
3051 return SarFunc;
3052}
3053
3054Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3055 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3056 AttributeList FuncAttrs) {
3057 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3058 LLVMContext &Ctx = M.getContext();
3059 auto *FuncTy = FunctionType::get(
3060 Builder.getVoidTy(),
3061 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3062 /* IsVarArg */ false);
3063 Function *LtGCFunc =
3064 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3065 "_omp_reduction_list_to_global_copy_func", &M);
3066 LtGCFunc->setAttributes(FuncAttrs);
3067 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3068 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3069 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3070
3071 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3072 Builder.SetInsertPoint(EntryBlock);
3073
3074 // Buffer: global reduction buffer.
3075 Argument *BufferArg = LtGCFunc->getArg(0);
3076 // Idx: index of the buffer.
3077 Argument *IdxArg = LtGCFunc->getArg(1);
3078 // ReduceList: thread local Reduce list.
3079 Argument *ReduceListArg = LtGCFunc->getArg(2);
3080
3081 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3082 BufferArg->getName() + ".addr");
3083 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3084 IdxArg->getName() + ".addr");
3085 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3086 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3087 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3088 BufferArgAlloca, Builder.getPtrTy(),
3089 BufferArgAlloca->getName() + ".ascast");
3090 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3091 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3092 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3093 ReduceListArgAlloca, Builder.getPtrTy(),
3094 ReduceListArgAlloca->getName() + ".ascast");
3095
3096 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3097 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3098 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3099
3100 Value *LocalReduceList =
3101 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3102 Value *BufferArgVal =
3103 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3104 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3105 Type *IndexTy = Builder.getIndexTy(
3106 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3107 for (auto En : enumerate(ReductionInfos)) {
3108 const ReductionInfo &RI = En.value();
3109 auto *RedListArrayTy =
3110 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3111 // Reduce element = LocalReduceList[i]
3112 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3113 RedListArrayTy, LocalReduceList,
3114 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3115 // elemptr = ((CopyType*)(elemptrptr)) + I
3116 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3117
3118 // Global = Buffer.VD[Idx];
3119 Value *BufferVD =
3120 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3121 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3122 ReductionsBufferTy, BufferVD, 0, En.index());
3123
3124 switch (RI.EvaluationKind) {
3125 case EvalKind::Scalar: {
3126 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3127 Builder.CreateStore(TargetElement, GlobVal);
3128 break;
3129 }
3130 case EvalKind::Complex: {
3131 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3132 RI.ElementType, ElemPtr, 0, 0, ".realp");
3133 Value *SrcReal = Builder.CreateLoad(
3134 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3135 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3136 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3137 Value *SrcImg = Builder.CreateLoad(
3138 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3139
3140 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3141 RI.ElementType, GlobVal, 0, 0, ".realp");
3142 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3143 RI.ElementType, GlobVal, 0, 1, ".imagp");
3144 Builder.CreateStore(SrcReal, DestRealPtr);
3145 Builder.CreateStore(SrcImg, DestImgPtr);
3146 break;
3147 }
3148 case EvalKind::Aggregate: {
3149 Value *SizeVal =
3150 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3151 Builder.CreateMemCpy(
3152 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3153 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3154 break;
3155 }
3156 }
3157 }
3158
3159 Builder.CreateRetVoid();
3160 Builder.restoreIP(OldIP);
3161 return LtGCFunc;
3162}
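// Conceptually, the helper emitted above performs (a sketch; the struct and
// field spellings are illustrative):
//
//   void _omp_reduction_list_to_global_copy_func(void *Buffer, int Idx,
//                                                void **ReduceList) {
//     struct _globalized_locals_ty *G = Buffer;
//     for each reduction element i:
//       G[Idx].field[i] = *(ElemTy *)ReduceList[i];  // scalar store,
//                                                    // complex pair, or
//                                                    // memcpy per EvalKind
//   }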
3163
3164Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3165 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3166 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3167 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3168 LLVMContext &Ctx = M.getContext();
3169 auto *FuncTy = FunctionType::get(
3170 Builder.getVoidTy(),
3171 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3172 /* IsVarArg */ false);
3173 Function *LtGRFunc =
3174 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3175 "_omp_reduction_list_to_global_reduce_func", &M);
3176 LtGRFunc->setAttributes(FuncAttrs);
3177 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3178 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3179 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3180
3181 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3182 Builder.SetInsertPoint(EntryBlock);
3183
3184 // Buffer: global reduction buffer.
3185 Argument *BufferArg = LtGRFunc->getArg(0);
3186 // Idx: index of the buffer.
3187 Argument *IdxArg = LtGRFunc->getArg(1);
3188 // ReduceList: thread local Reduce list.
3189 Argument *ReduceListArg = LtGRFunc->getArg(2);
3190
3191 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3192 BufferArg->getName() + ".addr");
3193 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3194 IdxArg->getName() + ".addr");
3195 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3196 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3197 auto *RedListArrayTy =
3198 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3199
3200 // 1. Build a list of reduction variables.
3201 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3202 Value *LocalReduceList =
3203 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3204
3205 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3206 BufferArgAlloca, Builder.getPtrTy(),
3207 BufferArgAlloca->getName() + ".ascast");
3208 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3209 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3210 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3211 ReduceListArgAlloca, Builder.getPtrTy(),
3212 ReduceListArgAlloca->getName() + ".ascast");
3213 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3214 LocalReduceList, Builder.getPtrTy(),
3215 LocalReduceList->getName() + ".ascast");
3216
3217 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3218 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3219 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3220
3221 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3222 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3223 Type *IndexTy = Builder.getIndexTy(
3224 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3225 for (auto En : enumerate(ReductionInfos)) {
3226 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3227 RedListArrayTy, LocalReduceListAddrCast,
3228 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3229 Value *BufferVD =
3230 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3231 // Global = Buffer.VD[Idx];
3232 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3233 ReductionsBufferTy, BufferVD, 0, En.index());
3234 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3235 }
3236
3237 // Call reduce_function(GlobalReduceList, ReduceList)
3238 Value *ReduceList =
3239 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3240 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3241 ->addFnAttr(Attribute::NoUnwind);
3242 Builder.CreateRetVoid();
3243 Builder.restoreIP(OldIP);
3244 return LtGRFunc;
3245}
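// The helper emitted above builds a second reduce list whose entries point
// into the global buffer and hands both lists to ReduceFn. Roughly (a
// sketch; field spellings are illustrative):
//
//   void _omp_reduction_list_to_global_reduce_func(void *Buffer, int Idx,
//                                                  void **ReduceList) {
//     void *GlobalList[<n>] = { &Buffer[Idx].field[0], ... };
//     ReduceFn(GlobalList, ReduceList);  // the globals act as the LHS
//   }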
3246
3247Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3248 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3249 AttributeList FuncAttrs) {
3250 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3251 LLVMContext &Ctx = M.getContext();
3252 auto *FuncTy = FunctionType::get(
3253 Builder.getVoidTy(),
3254 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3255 /* IsVarArg */ false);
3256 Function *LtGCFunc =
3257 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3258 "_omp_reduction_global_to_list_copy_func", &M);
3259 LtGCFunc->setAttributes(FuncAttrs);
3260 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3261 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3262 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3263
3264 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3265 Builder.SetInsertPoint(EntryBlock);
3266
3267 // Buffer: global reduction buffer.
3268 Argument *BufferArg = LtGCFunc->getArg(0);
3269 // Idx: index of the buffer.
3270 Argument *IdxArg = LtGCFunc->getArg(1);
3271 // ReduceList: thread local Reduce list.
3272 Argument *ReduceListArg = LtGCFunc->getArg(2);
3273
3274 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3275 BufferArg->getName() + ".addr");
3276 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3277 IdxArg->getName() + ".addr");
3278 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3279 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3280 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3281 BufferArgAlloca, Builder.getPtrTy(),
3282 BufferArgAlloca->getName() + ".ascast");
3283 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3284 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3285 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3286 ReduceListArgAlloca, Builder.getPtrTy(),
3287 ReduceListArgAlloca->getName() + ".ascast");
3288 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3289 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3290 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3291
3292 Value *LocalReduceList =
3293 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3294 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3295 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3296 Type *IndexTy = Builder.getIndexTy(
3297 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3298 for (auto En : enumerate(ReductionInfos)) {
3299 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3300 auto *RedListArrayTy =
3301 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3302 // Reduce element = LocalReduceList[i]
3303 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3304 RedListArrayTy, LocalReduceList,
3305 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3306 // elemptr = ((CopyType*)(elemptrptr)) + I
3307 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3308 // Global = Buffer.VD[Idx];
3309 Value *BufferVD =
3310 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3311 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3312 ReductionsBufferTy, BufferVD, 0, En.index());
3313
3314 switch (RI.EvaluationKind) {
3315 case EvalKind::Scalar: {
3316 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3317 Builder.CreateStore(TargetElement, ElemPtr);
3318 break;
3319 }
3320 case EvalKind::Complex: {
3321 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3322 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3323 Value *SrcReal = Builder.CreateLoad(
3324 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3325 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3326 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3327 Value *SrcImg = Builder.CreateLoad(
3328 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3329
3330 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3331 RI.ElementType, ElemPtr, 0, 0, ".realp");
3332 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3333 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3334 Builder.CreateStore(SrcReal, DestRealPtr);
3335 Builder.CreateStore(SrcImg, DestImgPtr);
3336 break;
3337 }
3338 case EvalKind::Aggregate: {
3339 Value *SizeVal =
3340 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3341 Builder.CreateMemCpy(
3342 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3343 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3344 SizeVal, false);
3345 break;
3346 }
3347 }
3348 }
3349
3350 Builder.CreateRetVoid();
3351 Builder.restoreIP(OldIP);
3352 return LtGCFunc;
3353}
3354
3355Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3356 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3357 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3358 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3359 LLVMContext &Ctx = M.getContext();
3360 auto *FuncTy = FunctionType::get(
3361 Builder.getVoidTy(),
3362 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3363 /* IsVarArg */ false);
3364 Function *LtGRFunc =
3365 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3366 "_omp_reduction_global_to_list_reduce_func", &M);
3367 LtGRFunc->setAttributes(FuncAttrs);
3368 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3369 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3370 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3371
3372 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3373 Builder.SetInsertPoint(EntryBlock);
3374
3375 // Buffer: global reduction buffer.
3376 Argument *BufferArg = LtGRFunc->getArg(0);
3377 // Idx: index of the buffer.
3378 Argument *IdxArg = LtGRFunc->getArg(1);
3379 // ReduceList: thread local Reduce list.
3380 Argument *ReduceListArg = LtGRFunc->getArg(2);
3381
3382 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3383 BufferArg->getName() + ".addr");
3384 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3385 IdxArg->getName() + ".addr");
3386 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3387 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3388 ArrayType *RedListArrayTy =
3389 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3390
3391 // 1. Build a list of reduction variables.
3392 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3393 Value *LocalReduceList =
3394 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3395
3396 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3397 BufferArgAlloca, Builder.getPtrTy(),
3398 BufferArgAlloca->getName() + ".ascast");
3399 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3400 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3401 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3402 ReduceListArgAlloca, Builder.getPtrTy(),
3403 ReduceListArgAlloca->getName() + ".ascast");
3404 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3405 LocalReduceList, Builder.getPtrTy(),
3406 LocalReduceList->getName() + ".ascast");
3407
3408 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3409 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3410 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3411
3412 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3413 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3414 Type *IndexTy = Builder.getIndexTy(
3415 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3416 for (auto En : enumerate(ReductionInfos)) {
3417 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3418 RedListArrayTy, ReductionList,
3419 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3420 // Global = Buffer.VD[Idx];
3421 Value *BufferVD =
3422 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3423 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3424 ReductionsBufferTy, BufferVD, 0, En.index());
3425 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3426 }
3427
3428 // Call reduce_function(ReduceList, GlobalReduceList)
3429 Value *ReduceList =
3430 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3431 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3432 ->addFnAttr(Attribute::NoUnwind);
3433 Builder.CreateRetVoid();
3434 Builder.restoreIP(OldIP);
3435 return LtGRFunc;
3436}
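// The two global-to-list helpers above mirror their list-to-global
// counterparts with source and destination swapped; the reduce variant is
// roughly (a sketch; field spellings are illustrative):
//
//   void _omp_reduction_global_to_list_reduce_func(void *Buffer, int Idx,
//                                                  void **ReduceList) {
//     void *GlobalList[<n>] = { &Buffer[Idx].field[0], ... };
//     ReduceFn(ReduceList, GlobalList);  // the thread-local list is the LHS
//   }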
3437
3438std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3439 std::string Suffix =
3440 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3441 return (Name + Suffix).str();
3442}
3443
3444Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3445 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3446 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3447 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3448 {Builder.getPtrTy(), Builder.getPtrTy()},
3449 /* IsVarArg */ false);
3450 std::string Name = getReductionFuncName(ReducerName);
3451 Function *ReductionFunc =
3452 Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
3453 ReductionFunc->setAttributes(FuncAttrs);
3454 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3455 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3456 BasicBlock *EntryBB =
3457 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3458 Builder.SetInsertPoint(EntryBB);
3459
3460 // Need to alloca memory here and deal with the pointers before getting
3461 // LHS/RHS pointers out
3462 Value *LHSArrayPtr = nullptr;
3463 Value *RHSArrayPtr = nullptr;
3464 Argument *Arg0 = ReductionFunc->getArg(0);
3465 Argument *Arg1 = ReductionFunc->getArg(1);
3466 Type *Arg0Type = Arg0->getType();
3467 Type *Arg1Type = Arg1->getType();
3468
3469 Value *LHSAlloca =
3470 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3471 Value *RHSAlloca =
3472 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3473 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3474 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3475 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3476 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3477 Builder.CreateStore(Arg0, LHSAddrCast);
3478 Builder.CreateStore(Arg1, RHSAddrCast);
3479 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3480 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3481
3482 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3483 Type *IndexTy = Builder.getIndexTy(
3484 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3485 SmallVector<Value *> LHSPtrs, RHSPtrs;
3486 for (auto En : enumerate(ReductionInfos)) {
3487 const ReductionInfo &RI = En.value();
3488 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3489 RedArrayTy, RHSArrayPtr,
3490 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3491 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3492 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3493 RHSI8Ptr, RI.PrivateVariable->getType(),
3494 RHSI8Ptr->getName() + ".ascast");
3495
3496 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3497 RedArrayTy, LHSArrayPtr,
3498 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3499 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3500 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3501 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3502
3503 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3504 LHSPtrs.emplace_back(LHSPtr);
3505 RHSPtrs.emplace_back(RHSPtr);
3506 } else {
3507 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3508 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3509 Value *Reduced;
3510 InsertPointOrErrorTy AfterIP =
3511 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3512 if (!AfterIP)
3513 return AfterIP.takeError();
3514 if (!Builder.GetInsertBlock())
3515 return ReductionFunc;
3516
3517 Builder.restoreIP(*AfterIP);
3518 Builder.CreateStore(Reduced, LHSPtr);
3519 }
3520 }
3521
3522 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3523 for (auto En : enumerate(ReductionInfos)) {
3524 unsigned Index = En.index();
3525 const ReductionInfo &RI = En.value();
3526 Value *LHSFixupPtr, *RHSFixupPtr;
3527 Builder.restoreIP(RI.ReductionGenClang(
3528 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3529
3530 // Fix the callback code generated to use the correct Values for the LHS
3531 // and RHS
3532 LHSFixupPtr->replaceUsesWithIf(
3533 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3534 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3535 ReductionFunc;
3536 });
3537 RHSFixupPtr->replaceUsesWithIf(
3538 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3539 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3540 ReductionFunc;
3541 });
3542 }
3543
3544 Builder.CreateRetVoid();
3545 return ReductionFunc;
3546}
3547
3548static void
3549checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3550 bool IsGPU) {
3551 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3552 (void)RI;
3553 assert(RI.Variable && "expected non-null variable");
3554 assert(RI.PrivateVariable && "expected non-null private variable");
3555 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3556 "expected non-null reduction generator callback");
3557 if (!IsGPU) {
3558 assert(
3559 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3560 "expected variables and their private equivalents to have the same "
3561 "type");
3562 }
3563 assert(RI.Variable->getType()->isPointerTy() &&
3564 "expected variables to be pointers");
3565 }
3566}
3567
3568OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3569 const LocationDescription &Loc, InsertPointTy AllocaIP,
3570 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3571 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3572 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3573 Value *SrcLocInfo) {
3574 if (!updateToLocation(Loc))
3575 return InsertPointTy();
3576 Builder.restoreIP(CodeGenIP);
3577 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3578 LLVMContext &Ctx = M.getContext();
3579
3580 // Source location for the ident struct
3581 if (!SrcLocInfo) {
3582 uint32_t SrcLocStrSize;
3583 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3584 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3585 }
3586
3587 if (ReductionInfos.size() == 0)
3588 return Builder.saveIP();
3589
3590 BasicBlock *ContinuationBlock = nullptr;
3591 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3592 // Code copied from createReductions.
3593 BasicBlock *InsertBlock = Loc.IP.getBlock();
3594 ContinuationBlock =
3595 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3596 InsertBlock->getTerminator()->eraseFromParent();
3597 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3598 }
3599
3600 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3601 AttributeList FuncAttrs;
3602 AttrBuilder AttrBldr(Ctx);
3603 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3604 AttrBldr.addAttribute(Attr);
3605 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3606 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3607
3608 CodeGenIP = Builder.saveIP();
3609 Expected<Function *> ReductionResult =
3610 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3611 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3612 if (!ReductionResult)
3613 return ReductionResult.takeError();
3614 Function *ReductionFunc = *ReductionResult;
3615 Builder.restoreIP(CodeGenIP);
3616
3617 // Set the grid value in the config needed for lowering later on
3618 if (GridValue.has_value())
3619 Config.setGridValue(GridValue.value());
3620 else
3621 Config.setGridValue(getGridValue(T, ReductionFunc));
3622
3623 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3624 // RedList, shuffle_reduce_func, interwarp_copy_func);
3625 // or
3626 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3627 Value *Res;
3628
3629 // 1. Build a list of reduction variables.
3630 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3631 auto Size = ReductionInfos.size();
3632 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
3633 Type *FuncPtrTy =
3634 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
3635 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3636 CodeGenIP = Builder.saveIP();
3637 Builder.restoreIP(AllocaIP);
3638 Value *ReductionListAlloca =
3639 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3640 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3641 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3642 Builder.restoreIP(CodeGenIP);
3643 Type *IndexTy = Builder.getIndexTy(
3644 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3645 for (auto En : enumerate(ReductionInfos)) {
3646 const ReductionInfo &RI = En.value();
3647 Value *ElemPtr = Builder.CreateInBoundsGEP(
3648 RedArrayTy, ReductionList,
3649 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3650 Value *CastElem =
3651 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3652 Builder.CreateStore(CastElem, ElemPtr);
3653 }
3654 CodeGenIP = Builder.saveIP();
3655 Function *SarFunc =
3656 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3657 Expected<Function *> CopyResult =
3658 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3659 if (!CopyResult)
3660 return CopyResult.takeError();
3661 Function *WcFunc = *CopyResult;
3662 Builder.restoreIP(CodeGenIP);
3663
3664 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3665
3666 unsigned MaxDataSize = 0;
3667 SmallVector<Type *> ReductionTypeArgs;
3668 for (auto En : enumerate(ReductionInfos)) {
3669 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3670 if (Size > MaxDataSize)
3671 MaxDataSize = Size;
3672 ReductionTypeArgs.emplace_back(En.value().ElementType);
3673 }
3674 Value *ReductionDataSize =
3675 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3676 if (!IsTeamsReduction) {
3677 Value *SarFuncCast =
3678 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy);
3679 Value *WcFuncCast =
3680 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
3681 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3682 WcFuncCast};
3683 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3684 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3685 Res = Builder.CreateCall(Pv2Ptr, Args);
3686 } else {
3687 CodeGenIP = Builder.saveIP();
3688 StructType *ReductionsBufferTy = StructType::create(
3689 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3690 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3691 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3692 Function *LtGCFunc = emitListToGlobalCopyFunction(
3693 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3694 Function *LtGRFunc = emitListToGlobalReduceFunction(
3695 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3696 Function *GtLCFunc = emitGlobalToListCopyFunction(
3697 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3698 Function *GtLRFunc = emitGlobalToListReduceFunction(
3699 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3700 Builder.restoreIP(CodeGenIP);
3701
3702 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3703 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3704
3705 Value *Args3[] = {SrcLocInfo,
3706 KernelTeamsReductionPtr,
3707 Builder.getInt32(ReductionBufNum),
3708 ReductionDataSize,
3709 RL,
3710 SarFunc,
3711 WcFunc,
3712 LtGCFunc,
3713 LtGRFunc,
3714 GtLCFunc,
3715 GtLRFunc};
3716
3717 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3718 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3719 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3720 }
3721
3722 // 5. Build if (res == 1)
3723 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3724 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3725 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3726 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3727
3728 // 6. Build then branch: where we have reduced values in the master
3729 // thread in each team.
3730 // __kmpc_end_reduce{_nowait}(<gtid>);
3731 // break;
3732 emitBlock(ThenBB, CurFunc);
3733
3734 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3735 for (auto En : enumerate(ReductionInfos)) {
3736 const ReductionInfo &RI = En.value();
3737 Value *LHS = RI.Variable;
3738 Value *RHS =
3739 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3740
3741 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3742 Value *LHSPtr, *RHSPtr;
3743 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3744 &LHSPtr, &RHSPtr, CurFunc));
3745
3746 // Fix the callback code generated to use the correct Values for the LHS
3747 // and RHS
3748 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3749 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3750 ReductionFunc;
3751 });
3752 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3753 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3754 ReductionFunc;
3755 });
3756 } else {
3757 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3758 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3759 Value *Reduced;
3760 InsertPointOrErrorTy AfterIP =
3761 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3762 if (!AfterIP)
3763 return AfterIP.takeError();
3764 Builder.restoreIP(*AfterIP);
3765 Builder.CreateStore(Reduced, LHS, false);
3766 }
3767 }
3768 emitBlock(ExitBB, CurFunc);
3769 if (ContinuationBlock) {
3770 Builder.CreateBr(ContinuationBlock);
3771 Builder.SetInsertPoint(ContinuationBlock);
3772 }
3773 Config.setEmitLLVMUsed();
3774
3775 return Builder.saveIP();
3776}
3777
3778static Function *getFreshReductionFunc(Module &M) {
3779 Type *VoidTy = Type::getVoidTy(M.getContext());
3780 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3781 auto *FuncTy =
3782 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3783 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3784 ".omp.reduction.func", &M);
3785}
3786
3787static Error populateReductionFunction(
3788 Function *ReductionFunc,
3789 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3790 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3791 Module *Module = ReductionFunc->getParent();
3792 BasicBlock *ReductionFuncBlock =
3793 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3794 Builder.SetInsertPoint(ReductionFuncBlock);
3795 Value *LHSArrayPtr = nullptr;
3796 Value *RHSArrayPtr = nullptr;
3797 if (IsGPU) {
3798 // Need to alloca memory here and deal with the pointers before getting
3799 // LHS/RHS pointers out
3800 //
3801 Argument *Arg0 = ReductionFunc->getArg(0);
3802 Argument *Arg1 = ReductionFunc->getArg(1);
3803 Type *Arg0Type = Arg0->getType();
3804 Type *Arg1Type = Arg1->getType();
3805
3806 Value *LHSAlloca =
3807 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3808 Value *RHSAlloca =
3809 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3810 Value *LHSAddrCast =
3811 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3812 Value *RHSAddrCast =
3813 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3814 Builder.CreateStore(Arg0, LHSAddrCast);
3815 Builder.CreateStore(Arg1, RHSAddrCast);
3816 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3817 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3818 } else {
3819 LHSArrayPtr = ReductionFunc->getArg(0);
3820 RHSArrayPtr = ReductionFunc->getArg(1);
3821 }
3822
3823 unsigned NumReductions = ReductionInfos.size();
3824 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3825
3826 for (auto En : enumerate(ReductionInfos)) {
3827 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3828 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3829 RedArrayTy, LHSArrayPtr, 0, En.index());
3830 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3831 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3832 LHSI8Ptr, RI.Variable->getType());
3833 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3834 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3835 RedArrayTy, RHSArrayPtr, 0, En.index());
3836 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3837 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3838 RHSI8Ptr, RI.PrivateVariable->getType());
3839 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3840 Value *Reduced;
3841 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3842 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3843 if (!AfterIP)
3844 return AfterIP.takeError();
3845
3846 Builder.restoreIP(*AfterIP);
3847 // TODO: Consider flagging an error.
3848 if (!Builder.GetInsertBlock())
3849 return Error::success();
3850
3851 // The store is inside the reduction region when using by-ref.
3852 if (!IsByRef[En.index()])
3853 Builder.CreateStore(Reduced, LHSPtr);
3854 }
3855 Builder.CreateRetVoid();
3856 return Error::success();
3857}
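// In C-like pseudocode, the function populated above ends up with roughly
// this shape (a sketch; <reduce> stands for whatever RI.ReductionGen emits,
// and the ElemTy casts are illustrative):
//
//   void .omp.reduction.func(void **LHSList, void **RHSList) {
//     for each reduction element i:
//       ElemTy *LHS = (ElemTy *)LHSList[i];
//       ElemTy *RHS = (ElemTy *)RHSList[i];
//       *LHS = <reduce>(*LHS, *RHS);  // the store is elided for by-ref
//   }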
3858
3859OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3860 const LocationDescription &Loc, InsertPointTy AllocaIP,
3861 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3862 bool IsNoWait, bool IsTeamsReduction) {
3863 assert(ReductionInfos.size() == IsByRef.size());
3864 if (Config.isGPU())
3865 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3866 IsNoWait, IsTeamsReduction);
3867
3868 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3869
3870 if (!updateToLocation(Loc))
3871 return InsertPointTy();
3872
3873 if (ReductionInfos.size() == 0)
3874 return Builder.saveIP();
3875
3876 BasicBlock *InsertBlock = Loc.IP.getBlock();
3877 BasicBlock *ContinuationBlock =
3878 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3879 InsertBlock->getTerminator()->eraseFromParent();
3880
3881 // Create and populate array of type-erased pointers to private reduction
3882 // values.
3883 unsigned NumReductions = ReductionInfos.size();
3884 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3885 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3886 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3887
3888 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3889
3890 for (auto En : enumerate(ReductionInfos)) {
3891 unsigned Index = En.index();
3892 const ReductionInfo &RI = En.value();
3893 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3894 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3895 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3896 }
3897
3898 // Emit a call to the runtime function that orchestrates the reduction.
3899 // Declare the reduction function in the process.
3900 Type *IndexTy = Builder.getIndexTy(
3901 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3902 Function *Func = Builder.GetInsertBlock()->getParent();
3903 Module *Module = Func->getParent();
3904 uint32_t SrcLocStrSize;
3905 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3906 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3907 return RI.AtomicReductionGen;
3908 });
3909 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3910 CanGenerateAtomic
3911 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3912 : IdentFlag(0));
3913 Value *ThreadId = getOrCreateThreadID(Ident);
3914 Constant *NumVariables = Builder.getInt32(NumReductions);
3915 const DataLayout &DL = Module->getDataLayout();
3916 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3917 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3918 Function *ReductionFunc = getFreshReductionFunc(*Module);
3919 Value *Lock = getOMPCriticalRegionLock(".reduction");
3920 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3921 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3922 : RuntimeFunction::OMPRTL___kmpc_reduce);
3923 CallInst *ReduceCall =
3924 Builder.CreateCall(ReduceFunc,
3925 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3926 ReductionFunc, Lock},
3927 "reduce");
3928
3929 // Create final reduction entry blocks for the atomic and non-atomic case.
3930 // Emit IR that dispatches control flow to one of the blocks based on the
3931 // reduction supporting the atomic mode.
3932 BasicBlock *NonAtomicRedBlock =
3933 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3934 BasicBlock *AtomicRedBlock =
3935 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3936 SwitchInst *Switch =
3937 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3938 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3939 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3940
3941 // Populate the non-atomic reduction using the elementwise reduction function.
3942 // This loads the elements from the global and private variables and reduces
3943 // them before storing back the result to the global variable.
3944 Builder.SetInsertPoint(NonAtomicRedBlock);
3945 for (auto En : enumerate(ReductionInfos)) {
3946 const ReductionInfo &RI = En.value();
3947 Type *ValueType = RI.ElementType;
3948 // We have one less load for the by-ref case because that load is now
3949 // inside the reduction region.
3950 Value *RedValue = RI.Variable;
3951 if (!IsByRef[En.index()]) {
3952 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3953 "red.value." + Twine(En.index()));
3954 }
3955 Value *PrivateRedValue =
3956 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3957 "red.private.value." + Twine(En.index()));
3958 Value *Reduced;
3959 InsertPointOrErrorTy AfterIP =
3960 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3961 if (!AfterIP)
3962 return AfterIP.takeError();
3963 Builder.restoreIP(*AfterIP);
3964
3965 if (!Builder.GetInsertBlock())
3966 return InsertPointTy();
3967 // For the by-ref case, the store is inside the reduction region.
3968 if (!IsByRef[En.index()])
3969 Builder.CreateStore(Reduced, RI.Variable);
3970 }
3971 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3972 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3973 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3974 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3975 Builder.CreateBr(ContinuationBlock);
3976
3977 // Populate the atomic reduction using the atomic elementwise reduction
3978 // function. There are no loads/stores here because they will be happening
3979 // inside the atomic elementwise reduction.
3980 Builder.SetInsertPoint(AtomicRedBlock);
3981 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3982 for (const ReductionInfo &RI : ReductionInfos) {
3983 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3984 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3985 if (!AfterIP)
3986 return AfterIP.takeError();
3987 Builder.restoreIP(*AfterIP);
3988 if (!Builder.GetInsertBlock())
3989 return InsertPointTy();
3990 }
3991 Builder.CreateBr(ContinuationBlock);
3992 } else {
3993 Builder.CreateUnreachable();
3994 }
3995
3996 // Populate the outlined reduction function using the elementwise reduction
3997 // function. Partial values are extracted from the type-erased array of
3998 // pointers to private variables.
3999 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4000 IsByRef, /*isGPU=*/false);
4001 if (Err)
4002 return Err;
4003
4004 if (!Builder.GetInsertBlock())
4005 return InsertPointTy();
4006
4007 Builder.SetInsertPoint(ContinuationBlock);
4008 return Builder.saveIP();
4009}
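// Taken together, the host lowering above emits the following pattern, in
// C-like pseudocode (a sketch; the argument spellings follow the comments
// and runtime calls above):
//
//   switch (__kmpc_reduce{_nowait}(ident, gtid, <n>, sizeof(RedList),
//                                  RedList, .omp.reduction.func, &lock)) {
//   case 1: // non-atomic
//     <reduce private copies into the original variables>;
//     __kmpc_end_reduce{_nowait}(ident, gtid, &lock);
//     break;
//   case 2: // atomic
//     <atomically reduce private copies into the original variables>;
//     break;
//   default: // nothing to do
//     break;
//   }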
4010
4011OpenMPIRBuilder::InsertPointOrErrorTy
4012OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4013 BodyGenCallbackTy BodyGenCB,
4014 FinalizeCallbackTy FiniCB) {
4015 if (!updateToLocation(Loc))
4016 return Loc.IP;
4017
4018 Directive OMPD = Directive::OMPD_master;
4019 uint32_t SrcLocStrSize;
4020 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4021 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4022 Value *ThreadId = getOrCreateThreadID(Ident);
4023 Value *Args[] = {Ident, ThreadId};
4024
4025 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4026 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4027
4028 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4029 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4030
4031 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4032 /*Conditional*/ true, /*hasFinalize*/ true);
4033}
4034
4035OpenMPIRBuilder::InsertPointOrErrorTy
4036OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4037 BodyGenCallbackTy BodyGenCB,
4038 FinalizeCallbackTy FiniCB, Value *Filter) {
4039 if (!updateToLocation(Loc))
4040 return Loc.IP;
4041
4042 Directive OMPD = Directive::OMPD_masked;
4043 uint32_t SrcLocStrSize;
4044 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4045 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4046 Value *ThreadId = getOrCreateThreadID(Ident);
4047 Value *Args[] = {Ident, ThreadId, Filter};
4048 Value *ArgsEnd[] = {Ident, ThreadId};
4049
4050 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4051 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4052
4053 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4054 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4055
4056 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4057 /*Conditional*/ true, /*hasFinalize*/ true);
4058}
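// Both createMaster and createMasked reduce to the same conditional
// inlined-region pattern (a sketch):
//
//   if (__kmpc_master(ident, gtid)) {   // or __kmpc_masked(ident, gtid,
//     <region body>                     //                  filter)
//     __kmpc_end_master(ident, gtid);   // or __kmpc_end_masked(ident, gtid)
//   }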
4059
4061 llvm::FunctionCallee Callee,
4063 const llvm::Twine &Name) {
4064 llvm::CallInst *Call = Builder.CreateCall(
4065 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4066 Call->setDoesNotThrow();
4067 return Call;
4068}
4069
4070 // Expects the input basic block to be dominated by BeforeScanBB.
4071 // Once the scan directive is encountered, the code after it should be
4072 // dominated by AfterScanBB. The scan directive splits the code sequence
4073 // into an input phase and a scan phase. Based on whether the inclusive or
4074 // exclusive clause is used on the scan directive, and whether the input
4075 // loop or the scan loop is being lowered, it adds jumps to the input and
4076 // scan phases. The first scan loop is the input loop and the second is
4077 // the scan loop. The generated code currently handles only inclusive scans.
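//
// For an inclusive scan over one variable, the two lowered loops behave
// roughly as follows (a sketch; buffer is the temporary created by
// emitScanBasedDirectiveDeclsIR):
//
//   for (i = 0; i < n; ++i) {   // input loop (first scan loop)
//     red = <input phase>;
//     buffer[i] = red;          // emitted at the end of the input phase
//   }
//   <parallel prefix reduction over buffer; see emitScanReduction>
//   for (i = 0; i < n; ++i) {   // scan loop (second scan loop)
//     red = buffer[i];          // emitted on entry to the scan phase
//     <scan phase uses red>;
//   }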
4078OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4079 const LocationDescription &Loc, InsertPointTy AllocaIP,
4080 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4081 bool IsInclusive, ScanInfo *ScanRedInfo) {
4082 if (ScanRedInfo->OMPFirstScanLoop) {
4083 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4084 ScanVarsType, ScanRedInfo);
4085 if (Err)
4086 return Err;
4087 }
4088 if (!updateToLocation(Loc))
4089 return Loc.IP;
4090
4091 llvm::Value *IV = ScanRedInfo->IV;
4092
4093 if (ScanRedInfo->OMPFirstScanLoop) {
4094 // Emit buffer[i] = red; at the end of the input phase.
4095 for (size_t i = 0; i < ScanVars.size(); i++) {
4096 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4097 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4098 Type *DestTy = ScanVarsType[i];
4099 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4100 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4101
4102 Builder.CreateStore(Src, Val);
4103 }
4104 }
4105 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4106 emitBlock(ScanRedInfo->OMPScanDispatch,
4107 Builder.GetInsertBlock()->getParent());
4108
4109 if (!ScanRedInfo->OMPFirstScanLoop) {
4110 IV = ScanRedInfo->IV;
4111 // Emit red = buffer[i]; at the entrance to the scan phase.
4112 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4113 for (size_t i = 0; i < ScanVars.size(); i++) {
4114 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4115 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4116 Type *DestTy = ScanVarsType[i];
4117 Value *SrcPtr =
4118 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4119 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4120 Builder.CreateStore(Src, ScanVars[i]);
4121 }
4122 }
4123
4124 // TODO: Update it to CreateBr and remove dead blocks
4125 llvm::Value *CmpI = Builder.getInt1(true);
4126 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4127 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4128 ScanRedInfo->OMPAfterScanBlock);
4129 } else {
4130 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4131 ScanRedInfo->OMPBeforeScanBlock);
4132 }
4133 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4134 Builder.GetInsertBlock()->getParent());
4135 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4136 return Builder.saveIP();
4137}
4138
4139Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4140 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4141 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4142
4143 Builder.restoreIP(AllocaIP);
4144 // Create the shared pointer at alloca IP.
4145 for (size_t i = 0; i < ScanVars.size(); i++) {
4146 llvm::Value *BuffPtr =
4147 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4148 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4149 }
4150
4151 // Allocate the temporary buffer on the master thread.
4152 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4153 InsertPointTy CodeGenIP) -> Error {
4154 Builder.restoreIP(CodeGenIP);
4155 Value *AllocSpan =
4156 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4157 for (size_t i = 0; i < ScanVars.size(); i++) {
4158 Type *IntPtrTy = Builder.getInt32Ty();
4159 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4160 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4161 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4162 AllocSpan, nullptr, "arr");
4163 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4164 }
4165 return Error::success();
4166 };
4167 // TODO: Perform finalization actions for variables. This has to be
4168 // called for variables which have destructors/finalizers.
4169 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4170
4171 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4172 llvm::Value *FilterVal = Builder.getInt32(0);
4173 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4174 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4175
4176 if (!AfterIP)
4177 return AfterIP.takeError();
4178 Builder.restoreIP(*AfterIP);
4179 BasicBlock *InputBB = Builder.GetInsertBlock();
4180 if (InputBB->getTerminator())
4181 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4182 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4183 if (!AfterIP)
4184 return AfterIP.takeError();
4185 Builder.restoreIP(*AfterIP);
4186
4187 return Error::success();
4188}
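// The declarations emitted above correspond roughly to (a sketch; vla is
// the pointer slot created at AllocaIP and T a scan variable's type):
//
//   T *vla;
//   #pragma omp masked filter(0)
//   vla = malloc(sizeof(T) * (span + 1));  // temporary scan buffer
//   #pragma omp barrier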
4189
4190Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4191 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4192 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4193 InsertPointTy CodeGenIP) -> Error {
4194 Builder.restoreIP(CodeGenIP);
4195 for (ReductionInfo RedInfo : ReductionInfos) {
4196 Value *PrivateVar = RedInfo.PrivateVariable;
4197 Value *OrigVar = RedInfo.Variable;
4198 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4199 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4200
4201 Type *SrcTy = RedInfo.ElementType;
4202 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4203 "arrayOffset");
4204 Value *Src = Builder.CreateLoad(SrcTy, Val);
4205
4206 Builder.CreateStore(Src, OrigVar);
4207 Builder.CreateFree(Buff);
4208 }
4209 return Error::success();
4210 };
4211 // TODO: Perform finalization actions for variables. This has to be
4212 // called for variables which have destructors/finalizers.
4213 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4214
4215 if (ScanRedInfo->OMPScanFinish->getTerminator())
4216 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4217 else
4218 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4219
4220 llvm::Value *FilterVal = Builder.getInt32(0);
4221 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4222 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4223
4224 if (!AfterIP)
4225 return AfterIP.takeError();
4226 Builder.restoreIP(*AfterIP);
4227 BasicBlock *InputBB = Builder.GetInsertBlock();
4228 if (InputBB->getTerminator())
4229 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4230 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4231 if (!AfterIP)
4232 return AfterIP.takeError();
4233 Builder.restoreIP(*AfterIP);
4234 return Error::success();
4235}
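// The finalization emitted above amounts to (a sketch):
//
//   #pragma omp masked filter(0)
//   {
//     orig_var = buffer[span];  // the final reduction value
//     free(buffer);
//   }
//   #pragma omp barrier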
4236
4237OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4238 const LocationDescription &Loc,
4239 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4240 ScanInfo *ScanRedInfo) {
4241
4242 if (!updateToLocation(Loc))
4243 return Loc.IP;
4244 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4245 InsertPointTy CodeGenIP) -> Error {
4246 Builder.restoreIP(CodeGenIP);
4247 Function *CurFn = Builder.GetInsertBlock()->getParent();
4248 // for (int k = 0; k <= ceil(log2(n)); ++k)
4249 llvm::BasicBlock *LoopBB =
4250 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4251 llvm::BasicBlock *ExitBB =
4252 splitBB(Builder, false, "omp.outer.log.scan.exit");
4253 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4254 Builder.GetInsertBlock()->getModule(),
4255 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4256 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4257 llvm::Value *Arg =
4258 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4259 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4260 F = llvm::Intrinsic::getOrInsertDeclaration(
4261 Builder.GetInsertBlock()->getModule(),
4262 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4263 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4264 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4265 llvm::Value *NMin1 = Builder.CreateNUWSub(
4266 ScanRedInfo->Span,
4267 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4268 Builder.SetInsertPoint(InputBB);
4269 Builder.CreateBr(LoopBB);
4270 emitBlock(LoopBB, CurFn);
4271 Builder.SetInsertPoint(LoopBB);
4272
4273 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4274 // size pow2k = 1;
4275 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4276 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4277 InputBB);
4278 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4279 InputBB);
4280 // for (size i = n - 1; i >= 2 ^ k; --i)
4281 // tmp[i] op= tmp[i-pow2k];
4282 llvm::BasicBlock *InnerLoopBB =
4283 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4284 llvm::BasicBlock *InnerExitBB =
4285 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4286 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4287 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4288 emitBlock(InnerLoopBB, CurFn);
4289 Builder.SetInsertPoint(InnerLoopBB);
4290 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4291 IVal->addIncoming(NMin1, LoopBB);
4292 for (ReductionInfo RedInfo : ReductionInfos) {
4293 Value *ReductionVal = RedInfo.PrivateVariable;
4294 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4295 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4296 Type *DestTy = RedInfo.ElementType;
4297 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4298 Value *LHSPtr =
4299 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4300 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4301 Value *RHSPtr =
4302 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4303 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4304 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4305 Value *Result;
4306 InsertPointOrErrorTy AfterIP =
4307 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4308 if (!AfterIP)
4309 return AfterIP.takeError();
4310 Builder.CreateStore(Result, LHSPtr);
4311 }
4312 llvm::Value *NextIVal = Builder.CreateNUWSub(
4313 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4314 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4315 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4316 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4317 emitBlock(InnerExitBB, CurFn);
4318 llvm::Value *Next = Builder.CreateNUWAdd(
4319 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4320 Counter->addIncoming(Next, Builder.GetInsertBlock());
4321 // pow2k <<= 1;
4322 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4323 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4324 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4325 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4326 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4327 return Error::success();
4328 };
4329
4330 // TODO: Perform finalization actions for variables. This has to be
4331 // called for variables which have destructors/finalizers.
4332 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4333
4334 llvm::Value *FilterVal = Builder.getInt32(0);
4335 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4336 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4337
4338 if (!AfterIP)
4339 return AfterIP.takeError();
4340 Builder.restoreIP(*AfterIP);
4341 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4342
4343 if (!AfterIP)
4344 return AfterIP.takeError();
4345 Builder.restoreIP(*AfterIP);
4346 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4347 if (Err)
4348 return Err;
4349
4350 return AfterIP;
4351}
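// The masked region above performs the classic O(n log n) in-place parallel
// prefix pass over each scan buffer. A minimal host-side C++ sketch of the
// same algorithm (the names and the combiner `op` are illustrative, not part
// of the builder API):
//
//   #include <cmath>
//   void inclusiveScanPass(int *buf, unsigned n, int (*op)(int, int)) {
//     unsigned rounds = (unsigned)std::ceil(std::log2((double)n));
//     for (unsigned k = 0, pow2k = 1; k < rounds; ++k, pow2k <<= 1)
//       for (unsigned i = n - 1; i >= pow2k; --i)
//         buf[i] = op(buf[i], buf[i - pow2k]);
//   }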
4352
4353Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4354 llvm::function_ref<Error()> InputLoopGen,
4355 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4356 ScanInfo *ScanRedInfo) {
4357
4358 {
4359 // Emit loop with input phase:
4360 // for (i: 0..<num_iters>) {
4361 // <input phase>;
4362 // buffer[i] = red;
4363 // }
4364 ScanRedInfo->OMPFirstScanLoop = true;
4365 Error Err = InputLoopGen();
4366 if (Err)
4367 return Err;
4368 }
4369 {
4370 // Emit loop with scan phase:
4371 // for (i: 0..<num_iters>) {
4372 // red = buffer[i];
4373 // <scan phase>;
4374 // }
4375 ScanRedInfo->OMPFirstScanLoop = false;
4376 Error Err = ScanLoopGen(Builder.saveIP());
4377 if (Err)
4378 return Err;
4379 }
4380 return Error::success();
4381}
4382
4383void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4384 Function *Fun = Builder.GetInsertBlock()->getParent();
4385 ScanRedInfo->OMPScanDispatch =
4386 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4387 ScanRedInfo->OMPAfterScanBlock =
4388 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4389 ScanRedInfo->OMPBeforeScanBlock =
4390 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4391 ScanRedInfo->OMPScanLoopExit =
4392 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4393}
4394CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4395 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4396 BasicBlock *PostInsertBefore, const Twine &Name) {
4397 Module *M = F->getParent();
4398 LLVMContext &Ctx = M->getContext();
4399 Type *IndVarTy = TripCount->getType();
4400
4401 // Create the basic block structure.
4402 BasicBlock *Preheader =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4404 BasicBlock *Header =
4405 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4406 BasicBlock *Cond =
4407 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4408 BasicBlock *Body =
4409 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4410 BasicBlock *Latch =
4411 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4412 BasicBlock *Exit =
4413 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4414 BasicBlock *After =
4415 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4416
4417 // Use specified DebugLoc for new instructions.
4418 Builder.SetCurrentDebugLocation(DL);
4419
4420 Builder.SetInsertPoint(Preheader);
4421 Builder.CreateBr(Header);
4422
4423 Builder.SetInsertPoint(Header);
4424 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4425 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4426 Builder.CreateBr(Cond);
4427
4428 Builder.SetInsertPoint(Cond);
4429 Value *Cmp =
4430 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4431 Builder.CreateCondBr(Cmp, Body, Exit);
4432
4433 Builder.SetInsertPoint(Body);
4434 Builder.CreateBr(Latch);
4435
4436 Builder.SetInsertPoint(Latch);
4437 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4438 "omp_" + Name + ".next", /*HasNUW=*/true);
4439 Builder.CreateBr(Header);
4440 IndVarPHI->addIncoming(Next, Latch);
4441
4442 Builder.SetInsertPoint(Exit);
4443 Builder.CreateBr(After);
4444
4445 // Remember and return the canonical control flow.
4446 LoopInfos.emplace_front();
4447 CanonicalLoopInfo *CL = &LoopInfos.front();
4448
4449 CL->Header = Header;
4450 CL->Cond = Cond;
4451 CL->Latch = Latch;
4452 CL->Exit = Exit;
4453
4454#ifndef NDEBUG
4455 CL->assertOK();
4456#endif
4457 return CL;
4458}
4459
4460Expected<CanonicalLoopInfo *>
4461OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4462 LoopBodyGenCallbackTy BodyGenCB,
4463 Value *TripCount, const Twine &Name) {
4464 BasicBlock *BB = Loc.IP.getBlock();
4465 BasicBlock *NextBB = BB->getNextNode();
4466
4467 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4468 NextBB, NextBB, Name);
4469 BasicBlock *After = CL->getAfter();
4470
4471 // If location is not set, don't connect the loop.
4472 if (updateToLocation(Loc)) {
4473 // Split the loop at the insertion point: Branch to the preheader and move
4474 // every following instruction to after the loop (the After BB). Also, the
4475 // new successor is the loop's after block.
4476 spliceBB(Builder, After, /*CreateBranch=*/false);
4477 Builder.CreateBr(CL->getPreheader());
4478 }
4479
4480 // Emit the body content. We do it after connecting the loop to the CFG so
4481 // that the callback does not encounter degenerate BBs.
4482 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4483 return Err;
4484
4485#ifndef NDEBUG
4486 CL->assertOK();
4487#endif
4488 return CL;
4489}
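// A minimal usage sketch for this overload (hypothetical caller code; the
// module, IRBuilder position, and trip count are assumptions made purely for
// illustration):
//
//   OpenMPIRBuilder OMPBuilder(M);
//   OMPBuilder.initialize();
//   OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DebugLoc());
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy IP, Value *IV) {
//     Builder.restoreIP(IP);
//     // ... emit the body using IV, which counts from 0 to TripCount-1 ...
//     return Error::success();
//   };
//   Expected<CanonicalLoopInfo *> CLI = OMPBuilder.createCanonicalLoop(
//       Loc, BodyGenCB, /*TripCount=*/Builder.getInt32(128), "example");
//   // check CLI for an error before using the loop info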
4490
4491Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4492 ScanInfos.emplace_front();
4493 ScanInfo *Result = &ScanInfos.front();
4494 return Result;
4495}
4496
4497Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4498OpenMPIRBuilder::createCanonicalScanLoops(
4499 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4500 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4501 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4502 LocationDescription ComputeLoc =
4503 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4504 updateToLocation(ComputeLoc);
4505
4506 SmallVector<llvm::CanonicalLoopInfo *> Result;
4507
4508 Value *TripCount = calculateCanonicalLoopTripCount(
4509 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4510 ScanRedInfo->Span = TripCount;
4511 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4512 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4513
4514 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4515 Builder.restoreIP(CodeGenIP);
4516 ScanRedInfo->IV = IV;
4517 createScanBBs(ScanRedInfo);
4518 BasicBlock *InputBlock = Builder.GetInsertBlock();
4519 Instruction *Terminator = InputBlock->getTerminator();
4520 assert(Terminator->getNumSuccessors() == 1);
4521 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4522 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4523 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4524 Builder.GetInsertBlock()->getParent());
4525 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4526 emitBlock(ScanRedInfo->OMPScanLoopExit,
4527 Builder.GetInsertBlock()->getParent());
4528 Builder.CreateBr(ContinueBlock);
4529 Builder.SetInsertPoint(
4530 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4531 return BodyGenCB(Builder.saveIP(), IV);
4532 };
4533
4534 const auto &&InputLoopGen = [&]() -> Error {
4535 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4536 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4537 ComputeIP, Name, true, ScanRedInfo);
4538 if (!LoopInfo)
4539 return LoopInfo.takeError();
4540 Result.push_back(*LoopInfo);
4541 Builder.restoreIP((*LoopInfo)->getAfterIP());
4542 return Error::success();
4543 };
4544 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4545 Expected<CanonicalLoopInfo *> LoopInfo =
4546 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4547 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4548 if (!LoopInfo)
4549 return LoopInfo.takeError();
4550 Result.push_back(*LoopInfo);
4551 Builder.restoreIP((*LoopInfo)->getAfterIP());
4552 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4553 return Error::success();
4554 };
4555 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4556 if (Err)
4557 return Err;
4558 return Result;
4559}
4560
4561Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4562 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4563 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4564
4565 // Consider the following difficulties (assuming 8-bit signed integers):
4566 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4567 // DO I = 1, 100, 50
4568 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4569 // DO I = 100, 0, -128
4570
4571 // Start, Stop and Step must be of the same integer type.
4572 auto *IndVarTy = cast<IntegerType>(Start->getType());
4573 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4574 assert(IndVarTy == Step->getType() && "Step type mismatch");
4575
4576 updateToLocation(Loc);
4577
4578 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4579 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4580
4581 // Like Step, but always positive.
4582 Value *Incr = Step;
4583
4584 // Distance between Start and Stop; always positive.
4585 Value *Span;
4586
4587 // Condition whether no iterations are executed at all, e.g. because
4588 // UB < LB.
4589 Value *ZeroCmp;
4590
4591 if (IsSigned) {
4592 // Ensure that increment is positive. If not, negate and invert LB and UB.
4593 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4594 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4595 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4596 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4597 Span = Builder.CreateSub(UB, LB, "", false, true);
4598 ZeroCmp = Builder.CreateICmp(
4599 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4600 } else {
4601 Span = Builder.CreateSub(Stop, Start, "", true);
4602 ZeroCmp = Builder.CreateICmp(
4603 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4604 }
4605
4606 Value *CountIfLooping;
4607 if (InclusiveStop) {
4608 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4609 } else {
4610 // Avoid incrementing past stop since it could overflow.
4611 Value *CountIfTwo = Builder.CreateAdd(
4612 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4613 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4614 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4615 }
4616
4617 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4618 "omp_" + Name + ".tripcount");
4619}
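// A scalar model of the computation above for the unsigned case, plus a
// worked example (a sketch for illustration only):
//
//   uint32_t tripCount(uint32_t start, uint32_t stop, uint32_t step,
//                      bool inclusiveStop) {
//     if (inclusiveStop ? stop < start : stop <= start)
//       return 0;
//     uint32_t span = stop - start;
//     if (inclusiveStop)
//       return span / step + 1;
//     return span <= step ? 1 : (span - 1) / step + 1;
//   }
//
// E.g. start=1, stop=100, step=50 with exclusive stop: span=99, so the count
// is (99-1)/50 + 1 = 2 iterations (i = 1 and i = 51), computed without ever
// forming 1 + 2*50 = 101 and risking overflow.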
4620
4621Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4622 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4623 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4624 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4625 ScanInfo *ScanRedInfo) {
4626 LocationDescription ComputeLoc =
4627 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4628
4629 Value *TripCount = calculateCanonicalLoopTripCount(
4630 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4631
4632 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4633 Builder.restoreIP(CodeGenIP);
4634 Value *Span = Builder.CreateMul(IV, Step);
4635 Value *IndVar = Builder.CreateAdd(Span, Start);
4636 if (InScan)
4637 ScanRedInfo->IV = IndVar;
4638 return BodyGenCB(Builder.saveIP(), IndVar);
4639 };
4640 LocationDescription LoopLoc =
4641 ComputeIP.isSet()
4642 ? Loc
4643 : LocationDescription(Builder.saveIP(),
4644 Builder.getCurrentDebugLocation());
4645 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4646}
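// Conceptually, this overload normalizes the user loop to a 0-based canonical
// loop and rematerializes the user-visible induction variable inside the
// body; the emitted shape is roughly (a sketch, not literal IR):
//
//   for (uint32_t iv = 0; iv < tripCount; ++iv) {
//     uint32_t indVar = start + iv * step; // value handed to BodyGenCB
//     body(indVar);
//   }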
4647
4648// Returns an LLVM function to call for initializing loop bounds using OpenMP
4649// static scheduling for composite `distribute parallel for` depending on
4650// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4651// integers as unsigned similarly to CanonicalLoopInfo.
4652static FunctionCallee
4653 getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4654 OpenMPIRBuilder &OMPBuilder) {
4655 unsigned Bitwidth = Ty->getIntegerBitWidth();
4656 if (Bitwidth == 32)
4657 return OMPBuilder.getOrCreateRuntimeFunction(
4658 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4659 if (Bitwidth == 64)
4660 return OMPBuilder.getOrCreateRuntimeFunction(
4661 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4662 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4663}
4664
4665// Returns an LLVM function to call for initializing loop bounds using OpenMP
4666// static scheduling depending on `type`. Only i32 and i64 are supported by the
4667// runtime. Always interpret integers as unsigned similarly to
4668// CanonicalLoopInfo.
4669static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4670 OpenMPIRBuilder &OMPBuilder) {
4671 unsigned Bitwidth = Ty->getIntegerBitWidth();
4672 if (Bitwidth == 32)
4673 return OMPBuilder.getOrCreateRuntimeFunction(
4674 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4675 if (Bitwidth == 64)
4676 return OMPBuilder.getOrCreateRuntimeFunction(
4677 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4678 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4679}
4680
4681OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4682 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4683 WorksharingLoopType LoopType, bool NeedsBarrier) {
4684 assert(CLI->isValid() && "Requires a valid canonical loop");
4685 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4686 "Require dedicated allocate IP");
4687
4688 // Set up the source location value for OpenMP runtime.
4689 Builder.restoreIP(CLI->getPreheaderIP());
4690 Builder.SetCurrentDebugLocation(DL);
4691
4692 uint32_t SrcLocStrSize;
4693 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4694 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4695
4696 // Declare useful OpenMP runtime functions.
4697 Value *IV = CLI->getIndVar();
4698 Type *IVTy = IV->getType();
4699 FunctionCallee StaticInit =
4700 LoopType == WorksharingLoopType::DistributeForStaticLoop
4701 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4702 : getKmpcForStaticInitForType(IVTy, M, *this);
4703 FunctionCallee StaticFini =
4704 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4705
4706 // Allocate space for computed loop bounds as expected by the "init" function.
4707 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4708
4709 Type *I32Type = Type::getInt32Ty(M.getContext());
4710 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4711 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4712 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4713 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4714 CLI->setLastIter(PLastIter);
4715
4716 // At the end of the preheader, prepare for calling the "init" function by
4717 // storing the current loop bounds into the allocated space. A canonical loop
4718 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4719 // and produces an inclusive upper bound.
4720 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4721 Constant *Zero = ConstantInt::get(IVTy, 0);
4722 Constant *One = ConstantInt::get(IVTy, 1);
4723 Builder.CreateStore(Zero, PLowerBound);
4724 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4725 Builder.CreateStore(UpperBound, PUpperBound);
4726 Builder.CreateStore(One, PStride);
4727
4728 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4729
4730 OMPScheduleType SchedType =
4731 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4732 ? OMPScheduleType::OrderedDistribute
4733 : OMPScheduleType::UnorderedStatic;
4734 Constant *SchedulingType =
4735 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4736
4737 // Call the "init" function and update the trip count of the loop with the
4738 // value it produced.
4739 SmallVector<Value *, 10> Args(
4740 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4741 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4742 Value *PDistUpperBound =
4743 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4744 Args.push_back(PDistUpperBound);
4745 }
4746 Args.append({PStride, One, Zero});
4747 Builder.CreateCall(StaticInit, Args);
4748 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4749 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4750 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4751 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4752 CLI->setTripCount(TripCount);
4753
4754 // Update all uses of the induction variable except the one in the condition
4755 // block that compares it with the actual upper bound, and the increment in
4756 // the latch block.
4757
4758 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4759 Builder.SetInsertPoint(CLI->getBody(),
4760 CLI->getBody()->getFirstInsertionPt());
4761 Builder.SetCurrentDebugLocation(DL);
4762 return Builder.CreateAdd(OldIV, LowerBound);
4763 });
4764
4765 // In the "exit" block, call the "fini" function.
4766 Builder.SetInsertPoint(CLI->getExit(),
4767 CLI->getExit()->getTerminator()->getIterator());
4768 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4769
4770 // Add the barrier if requested.
4771 if (NeedsBarrier) {
4772 InsertPointOrErrorTy BarrierIP =
4773 createBarrier(LocationDescription(Builder.saveIP(), DL),
4774 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4775 /* CheckCancelFlag */ false);
4776 if (!BarrierIP)
4777 return BarrierIP.takeError();
4778 }
4779
4780 InsertPointTy AfterIP = CLI->getAfterIP();
4781 CLI->invalidate();
4782
4783 return AfterIP;
4784}
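// Per thread, the rewritten loop follows the usual static-schedule runtime
// contract; roughly (a hedged sketch of the intent, not literal output):
//
//   uint32_t lb = 0, ub = tripCount - 1, stride = 1;
//   int32_t lastIter = 0;
//   __kmpc_for_static_init_4u(loc, tid, schedType, &lastIter, &lb, &ub,
//                             &stride, /*incr=*/1, /*chunk=*/0);
//   for (uint32_t iv = lb; iv <= ub; ++iv)
//     body(iv); // the body sees the original IV shifted by lb
//   __kmpc_for_static_fini(loc, tid);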
4785
4786OpenMPIRBuilder::InsertPointOrErrorTy
4787OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4788 CanonicalLoopInfo *CLI,
4789 InsertPointTy AllocaIP,
4790 bool NeedsBarrier,
4791 Value *ChunkSize) {
4792 assert(CLI->isValid() && "Requires a valid canonical loop");
4793 assert(ChunkSize && "Chunk size is required");
4794
4795 LLVMContext &Ctx = CLI->getFunction()->getContext();
4796 Value *IV = CLI->getIndVar();
4797 Value *OrigTripCount = CLI->getTripCount();
4798 Type *IVTy = IV->getType();
4799 assert(IVTy->getIntegerBitWidth() <= 64 &&
4800 "Max supported tripcount bitwidth is 64 bits");
4801 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4802 : Type::getInt64Ty(Ctx);
4803 Type *I32Type = Type::getInt32Ty(M.getContext());
4804 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4805 Constant *One = ConstantInt::get(InternalIVTy, 1);
4806
4807 // Declare useful OpenMP runtime functions.
4808 FunctionCallee StaticInit =
4809 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4810 FunctionCallee StaticFini =
4811 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4812
4813 // Allocate space for computed loop bounds as expected by the "init" function.
4814 Builder.restoreIP(AllocaIP);
4815 Builder.SetCurrentDebugLocation(DL);
4816 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4817 Value *PLowerBound =
4818 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4819 Value *PUpperBound =
4820 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4821 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4822 CLI->setLastIter(PLastIter);
4823
4824 // Set up the source location value for the OpenMP runtime.
4825 Builder.restoreIP(CLI->getPreheaderIP());
4826 Builder.SetCurrentDebugLocation(DL);
4827
4828 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4829 Value *CastedChunkSize =
4830 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4831 Value *CastedTripCount =
4832 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4833
4834 Constant *SchedulingType = ConstantInt::get(
4835 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4836 Builder.CreateStore(Zero, PLowerBound);
4837 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4838 Builder.CreateStore(OrigUpperBound, PUpperBound);
4839 Builder.CreateStore(One, PStride);
4840
4841 // Call the "init" function and update the trip count of the loop with the
4842 // value it produced.
4843 uint32_t SrcLocStrSize;
4844 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4845 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4846 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4847 Builder.CreateCall(StaticInit,
4848 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4849 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4850 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4851 /*pstride=*/PStride, /*incr=*/One,
4852 /*chunk=*/CastedChunkSize});
4853
4854 // Load values written by the "init" function.
4855 Value *FirstChunkStart =
4856 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4857 Value *FirstChunkStop =
4858 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4859 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4860 Value *ChunkRange =
4861 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4862 Value *NextChunkStride =
4863 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4864
4865 // Create outer "dispatch" loop for enumerating the chunks.
4866 BasicBlock *DispatchEnter = splitBB(Builder, true);
4867 Value *DispatchCounter;
4868
4869 // It is safe to assume this didn't return an error because the callback
4870 // passed into createCanonicalLoop is the only possible error source, and it
4871 // always returns success.
4872 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4873 {Builder.saveIP(), DL},
4874 [&](InsertPointTy BodyIP, Value *Counter) {
4875 DispatchCounter = Counter;
4876 return Error::success();
4877 },
4878 FirstChunkStart, CastedTripCount, NextChunkStride,
4879 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4880 "dispatch"));
4881
4882 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4883 // not have to preserve the canonical invariant.
4884 BasicBlock *DispatchBody = DispatchCLI->getBody();
4885 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4886 BasicBlock *DispatchExit = DispatchCLI->getExit();
4887 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4888 DispatchCLI->invalidate();
4889
4890 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4891 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4892 redirectTo(CLI->getExit(), DispatchLatch, DL);
4893 redirectTo(DispatchBody, DispatchEnter, DL);
4894
4895 // Prepare the prolog of the chunk loop.
4896 Builder.restoreIP(CLI->getPreheaderIP());
4897 Builder.SetCurrentDebugLocation(DL);
4898
4899 // Compute the number of iterations of the chunk loop.
4900 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4901 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4902 Value *IsLastChunk =
4903 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4904 Value *CountUntilOrigTripCount =
4905 Builder.CreateSub(CastedTripCount, DispatchCounter);
4906 Value *ChunkTripCount = Builder.CreateSelect(
4907 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4908 Value *BackcastedChunkTC =
4909 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4910 CLI->setTripCount(BackcastedChunkTC);
4911
4912 // Update all uses of the induction variable except the one in the condition
4913 // block that compares it with the actual upper bound, and the increment in
4914 // the latch block.
4915 Value *BackcastedDispatchCounter =
4916 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4917 CLI->mapIndVar([&](Instruction *) -> Value * {
4918 Builder.restoreIP(CLI->getBodyIP());
4919 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4920 });
4921
4922 // In the "exit" block, call the "fini" function.
4923 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4924 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4925
4926 // Add the barrier if requested.
4927 if (NeedsBarrier) {
4928 InsertPointOrErrorTy AfterIP =
4929 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4930 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4931 if (!AfterIP)
4932 return AfterIP.takeError();
4933 }
4934
4935#ifndef NDEBUG
4936 // Even though we currently do not support applying additional methods to it,
4937 // the chunk loop should remain a canonical loop.
4938 CLI->assertOK();
4939#endif
4940
4941 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4942}
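// The resulting control flow is a two-level nest: an outer dispatch loop over
// chunk start indices, with the original loop reduced to iterating a single
// chunk. Roughly (a sketch of the intent, with illustrative names):
//
//   // "init" fills [lb, ub] of the first chunk and the stride between chunks
//   for (uint64_t chunkStart = lb; chunkStart < tripCount;
//        chunkStart += stride) {
//     uint64_t remaining = tripCount - chunkStart;
//     uint64_t chunkTrip = remaining < chunkRange ? remaining : chunkRange;
//     for (uint64_t iv = 0; iv < chunkTrip; ++iv)
//       body(chunkStart + iv); // the chunk loop is the original loop
//   }
//   __kmpc_for_static_fini(loc, tid);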
4943
4944// Returns an LLVM function to call for executing an OpenMP static worksharing
4945// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4946// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4947static FunctionCallee
4948getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4949 WorksharingLoopType LoopType) {
4950 unsigned Bitwidth = Ty->getIntegerBitWidth();
4951 Module &M = OMPBuilder->M;
4952 switch (LoopType) {
4953 case WorksharingLoopType::ForStaticLoop:
4954 if (Bitwidth == 32)
4955 return OMPBuilder->getOrCreateRuntimeFunction(
4956 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4957 if (Bitwidth == 64)
4958 return OMPBuilder->getOrCreateRuntimeFunction(
4959 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4960 break;
4961 case WorksharingLoopType::DistributeStaticLoop:
4962 if (Bitwidth == 32)
4963 return OMPBuilder->getOrCreateRuntimeFunction(
4964 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4965 if (Bitwidth == 64)
4966 return OMPBuilder->getOrCreateRuntimeFunction(
4967 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4968 break;
4969 case WorksharingLoopType::DistributeForStaticLoop:
4970 if (Bitwidth == 32)
4971 return OMPBuilder->getOrCreateRuntimeFunction(
4972 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4973 if (Bitwidth == 64)
4974 return OMPBuilder->getOrCreateRuntimeFunction(
4975 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4976 break;
4977 }
4978 if (Bitwidth != 32 && Bitwidth != 64) {
4979 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4980 }
4981 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4982}
4983
4984// Inserts a call to the proper OpenMP Device RTL function which handles
4985// loop worksharing.
4986static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4987 WorksharingLoopType LoopType,
4988 BasicBlock *InsertBlock, Value *Ident,
4989 Value *LoopBodyArg, Value *TripCount,
4990 Function &LoopBodyFn, bool NoLoop) {
4991 Type *TripCountTy = TripCount->getType();
4992 Module &M = OMPBuilder->M;
4993 IRBuilder<> &Builder = OMPBuilder->Builder;
4994 FunctionCallee RTLFn =
4995 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4996 SmallVector<Value *, 8> RealArgs;
4997 RealArgs.push_back(Ident);
4998 RealArgs.push_back(&LoopBodyFn);
4999 RealArgs.push_back(LoopBodyArg);
5000 RealArgs.push_back(TripCount);
5001 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5002 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5003 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5004 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5005 Builder.CreateCall(RTLFn, RealArgs);
5006 return;
5007 }
5008 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5009 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5010 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5011 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5012
5013 RealArgs.push_back(
5014 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5015 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5016 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5017 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5018 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5019 } else {
5020 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5021 }
5022
5023 Builder.CreateCall(RTLFn, RealArgs);
5024}
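// For the plain ForStaticLoop case, the call assembled above has roughly the
// shape below (a sketch derived from the argument list built in this
// function; parameter names are illustrative, not the runtime's):
//
//   uint32_t nt = omp_get_num_threads();
//   __kmpc_for_static_loop_4u(ident, loopBodyFn, loopBodyArg, tripCount,
//                             /*num_threads=*/nt, /*block_chunk=*/0,
//                             /*flag=*/0);
//
// where loopBodyFn(counter, loopBodyArg) runs one iteration of the outlined
// body.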
5025
5026static void workshareLoopTargetCallback(
5027 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5028 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5029 WorksharingLoopType LoopType, bool NoLoop) {
5030 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5031 BasicBlock *Preheader = CLI->getPreheader();
5032 Value *TripCount = CLI->getTripCount();
5033
5034 // After loop body outlining, the loop body contains only the setup of the
5035 // loop body argument structure and the call to the outlined loop body
5036 // function. First, we need to move the setup of the loop body args into the
5037 // loop preheader.
5038 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5039 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5040
5041 // The next step is to remove the whole loop. We do not need it anymore.
5042 // That's why we make an unconditional branch from the loop preheader to the
5043 // loop exit block.
5044 Builder.restoreIP({Preheader, Preheader->end()});
5045 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5046 Preheader->getTerminator()->eraseFromParent();
5047 Builder.CreateBr(CLI->getExit());
5048
5049 // Delete dead loop blocks
5050 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5051 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5052 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5053 CleanUpInfo.EntryBB = CLI->getHeader();
5054 CleanUpInfo.ExitBB = CLI->getExit();
5055 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5056 DeleteDeadBlocks(BlocksToBeRemoved);
5057
5058 // Find the instruction which corresponds to the loop body argument
5059 // structure and remove the call to the loop body function.
5060 Value *LoopBodyArg;
5061 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5062 assert(OutlinedFnUser &&
5063 "Expected unique undroppable user of outlined function");
5064 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5065 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5066 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5067 "Expected outlined function call to be located in loop preheader");
5068 // Check in case no argument structure has been passed.
5069 if (OutlinedFnCallInstruction->arg_size() > 1)
5070 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5071 else
5072 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5073 OutlinedFnCallInstruction->eraseFromParent();
5074
5075 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5076 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5077
5078 for (auto &ToBeDeletedItem : ToBeDeleted)
5079 ToBeDeletedItem->eraseFromParent();
5080 CLI->invalidate();
5081}
5082
5083OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5084 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5085 WorksharingLoopType LoopType, bool NoLoop) {
5086 uint32_t SrcLocStrSize;
5087 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5088 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5089
5090 OutlineInfo OI;
5091 OI.OuterAllocaBB = CLI->getPreheader();
5092 Function *OuterFn = CLI->getPreheader()->getParent();
5093
5094 // Instructions which need to be deleted at the end of code generation
5095 SmallVector<Instruction *, 4> ToBeDeleted;
5096
5097 OI.OuterAllocaBB = AllocaIP.getBlock();
5098
5099 // Mark the body loop as region which needs to be extracted
5100 OI.EntryBB = CLI->getBody();
5101 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5102 "omp.prelatch", true);
5103
5104 // Prepare loop body for extraction
5105 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5106
5107 // Insert new loop counter variable which will be used only in loop
5108 // body.
5109 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5110 Instruction *NewLoopCntLoad =
5111 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5112 // New loop counter instructions are redundant in the loop preheader when
5113 // code generation for the workshare loop is finished. That's why we mark
5114 // them as ready for deletion.
5115 ToBeDeleted.push_back(NewLoopCntLoad);
5116 ToBeDeleted.push_back(NewLoopCnt);
5117
5118 // Analyse loop body region. Find all input variables which are used inside
5119 // loop body region.
5120 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5121 SmallVector<BasicBlock *, 32> Blocks;
5122 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5123
5124 CodeExtractorAnalysisCache CEAC(*OuterFn);
5125 CodeExtractor Extractor(Blocks,
5126 /* DominatorTree */ nullptr,
5127 /* AggregateArgs */ true,
5128 /* BlockFrequencyInfo */ nullptr,
5129 /* BranchProbabilityInfo */ nullptr,
5130 /* AssumptionCache */ nullptr,
5131 /* AllowVarArgs */ true,
5132 /* AllowAlloca */ true,
5133 /* AllocationBlock */ CLI->getPreheader(),
5134 /* Suffix */ ".omp_wsloop",
5135 /* AggrArgsIn0AddrSpace */ true);
5136
5137 BasicBlock *CommonExit = nullptr;
5138 SetVector<Value *> SinkingCands, HoistingCands;
5139
5140 // Find allocas outside the loop body region which are used inside loop
5141 // body
5142 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5143
5144 // We need to model the loop body region as the function f(cnt, loop_arg).
5145 // That's why we replace the loop induction variable with the new counter,
5146 // which will be one of the loop body function's arguments.
5147 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5148 CLI->getIndVar()->user_end());
5149 for (auto Use : Users) {
5150 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5151 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5152 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5153 }
5154 }
5155 }
5156 // Make sure that the loop counter variable is not merged into the loop body
5157 // function argument structure and that it is passed as a separate variable.
5158 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5159
5160 // The PostOutline CB is invoked when the loop body function is outlined and
5161 // the loop body is replaced by a call to the outlined function. We need to
5162 // add a call to the OpenMP device RTL inside the loop preheader. The OpenMP
5163 // device RTL function will handle the loop control logic.
5164 //
5165 OI.PostOutlineCB = [=, ToBeDeletedVec =
5166 std::move(ToBeDeleted)](Function &OutlinedFn) {
5167 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5168 LoopType, NoLoop);
5169 };
5170 addOutlineInfo(std::move(OI));
5171 return CLI->getAfterIP();
5172}
5173
5174OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5175 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5176 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5177 bool HasSimdModifier, bool HasMonotonicModifier,
5178 bool HasNonmonotonicModifier, bool HasOrderedClause,
5179 WorksharingLoopType LoopType, bool NoLoop) {
5180 if (Config.isTargetDevice())
5181 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5182 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5183 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5184 HasNonmonotonicModifier, HasOrderedClause);
5185
5186 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5187 OMPScheduleType::ModifierOrdered;
5188 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5189 case OMPScheduleType::BaseStatic:
5190 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5191 if (IsOrdered)
5192 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5193 NeedsBarrier, ChunkSize);
5194 // FIXME: Monotonicity ignored?
5195 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5196
5197 case OMPScheduleType::BaseStaticChunked:
5198 if (IsOrdered)
5199 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5200 NeedsBarrier, ChunkSize);
5201 // FIXME: Monotonicity ignored?
5202 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5203 ChunkSize);
5204
5205 case OMPScheduleType::BaseRuntime:
5206 case OMPScheduleType::BaseAuto:
5207 case OMPScheduleType::BaseGreedy:
5208 case OMPScheduleType::BaseBalanced:
5209 case OMPScheduleType::BaseSteal:
5210 case OMPScheduleType::BaseGuidedSimd:
5211 case OMPScheduleType::BaseRuntimeSimd:
5212 assert(!ChunkSize &&
5213 "schedule type does not support user-defined chunk sizes");
5214 [[fallthrough]];
5215 case OMPScheduleType::BaseDynamicChunked:
5216 case OMPScheduleType::BaseGuidedChunked:
5217 case OMPScheduleType::BaseGuidedIterativeChunked:
5218 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5219 case OMPScheduleType::BaseStaticBalancedChunked:
5220 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5221 NeedsBarrier, ChunkSize);
5222
5223 default:
5224 llvm_unreachable("Unknown/unimplemented schedule kind");
5225 }
5226}
5227
5228/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5229/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5230/// the runtime. Always interpret integers as unsigned similarly to
5231/// CanonicalLoopInfo.
5232static FunctionCallee
5233getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5234 unsigned Bitwidth = Ty->getIntegerBitWidth();
5235 if (Bitwidth == 32)
5236 return OMPBuilder.getOrCreateRuntimeFunction(
5237 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5238 if (Bitwidth == 64)
5239 return OMPBuilder.getOrCreateRuntimeFunction(
5240 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5241 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5242}
5243
5244/// Returns an LLVM function to call for updating the next loop using OpenMP
5245/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5246/// the runtime. Always interpret integers as unsigned similarly to
5247/// CanonicalLoopInfo.
5248static FunctionCallee
5249getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5250 unsigned Bitwidth = Ty->getIntegerBitWidth();
5251 if (Bitwidth == 32)
5252 return OMPBuilder.getOrCreateRuntimeFunction(
5253 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5254 if (Bitwidth == 64)
5255 return OMPBuilder.getOrCreateRuntimeFunction(
5256 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5257 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5258}
5259
5260/// Returns an LLVM function to call for finalizing the dynamic loop using
5261/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5262/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5263static FunctionCallee
5264getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5265 unsigned Bitwidth = Ty->getIntegerBitWidth();
5266 if (Bitwidth == 32)
5267 return OMPBuilder.getOrCreateRuntimeFunction(
5268 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5269 if (Bitwidth == 64)
5270 return OMPBuilder.getOrCreateRuntimeFunction(
5271 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5272 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5273}
5274
5275OpenMPIRBuilder::InsertPointOrErrorTy
5276OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5277 InsertPointTy AllocaIP,
5278 OMPScheduleType SchedType,
5279 bool NeedsBarrier, Value *Chunk) {
5280 assert(CLI->isValid() && "Requires a valid canonical loop");
5281 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5282 "Require dedicated allocate IP");
5284 "Require valid schedule type");
5285
5286 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5287 OMPScheduleType::ModifierOrdered;
5288
5289 // Set up the source location value for OpenMP runtime.
5290 Builder.SetCurrentDebugLocation(DL);
5291
5292 uint32_t SrcLocStrSize;
5293 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5294 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5295
5296 // Declare useful OpenMP runtime functions.
5297 Value *IV = CLI->getIndVar();
5298 Type *IVTy = IV->getType();
5299 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5300 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5301
5302 // Allocate space for computed loop bounds as expected by the "init" function.
5303 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5304 Type *I32Type = Type::getInt32Ty(M.getContext());
5305 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5306 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5307 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5308 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5309 CLI->setLastIter(PLastIter);
5310
5311 // At the end of the preheader, prepare for calling the "init" function by
5312 // storing the current loop bounds into the allocated space. A canonical loop
5313 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5314 // and produces an inclusive upper bound.
5315 BasicBlock *PreHeader = CLI->getPreheader();
5316 Builder.SetInsertPoint(PreHeader->getTerminator());
5317 Constant *One = ConstantInt::get(IVTy, 1);
5318 Builder.CreateStore(One, PLowerBound);
5319 Value *UpperBound = CLI->getTripCount();
5320 Builder.CreateStore(UpperBound, PUpperBound);
5321 Builder.CreateStore(One, PStride);
5322
5323 BasicBlock *Header = CLI->getHeader();
5324 BasicBlock *Exit = CLI->getExit();
5325 BasicBlock *Cond = CLI->getCond();
5326 BasicBlock *Latch = CLI->getLatch();
5327 InsertPointTy AfterIP = CLI->getAfterIP();
5328
5329 // The CLI will be "broken" in the code below, as the loop is no longer
5330 // a valid canonical loop.
5331
5332 if (!Chunk)
5333 Chunk = One;
5334
5335 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5336
5337 Constant *SchedulingType =
5338 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5339
5340 // Call the "init" function.
5341 Builder.CreateCall(DynamicInit,
5342 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5343 UpperBound, /* step */ One, Chunk});
5344
5345 // An outer loop around the existing one.
5346 BasicBlock *OuterCond = BasicBlock::Create(
5347 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5348 PreHeader->getParent());
5349 // This needs to be 32-bit always, so we cannot reuse an IVTy constant here.
5350 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5351 Value *Res =
5352 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5353 PLowerBound, PUpperBound, PStride});
5354 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5355 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5356 Value *LowerBound =
5357 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5358 Builder.CreateCondBr(MoreWork, Header, Exit);
5359
5360 // Change PHI-node in loop header to use outer cond rather than preheader,
5361 // and set IV to the LowerBound.
5362 Instruction *Phi = &Header->front();
5363 auto *PI = cast<PHINode>(Phi);
5364 PI->setIncomingBlock(0, OuterCond);
5365 PI->setIncomingValue(0, LowerBound);
5366
5367 // Then set the pre-header to jump to the OuterCond
5368 Instruction *Term = PreHeader->getTerminator();
5369 auto *Br = cast<BranchInst>(Term);
5370 Br->setSuccessor(0, OuterCond);
5371
5372 // Modify the inner condition:
5373 // * Use the UpperBound returned from the DynamicNext call.
5374 // * jump to the loop outer loop when done with one of the inner loops.
5375 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5376 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5377 Instruction *Comp = &*Builder.GetInsertPoint();
5378 auto *CI = cast<CmpInst>(Comp);
5379 CI->setOperand(1, UpperBound);
5380 // Redirect the inner exit to branch to outer condition.
5381 Instruction *Branch = &Cond->back();
5382 auto *BI = cast<BranchInst>(Branch);
5383 assert(BI->getSuccessor(1) == Exit);
5384 BI->setSuccessor(1, OuterCond);
5385
5386 // Call the "fini" function if "ordered" is present in wsloop directive.
5387 if (Ordered) {
5388 Builder.SetInsertPoint(&Latch->back());
5389 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5390 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5391 }
5392
5393 // Add the barrier if requested.
5394 if (NeedsBarrier) {
5395 Builder.SetInsertPoint(&Exit->back());
5396 InsertPointOrErrorTy BarrierIP =
5397 createBarrier(LocationDescription(Builder.saveIP(), DL),
5398 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5399 /* CheckCancelFlag */ false);
5400 if (!BarrierIP)
5401 return BarrierIP.takeError();
5402 }
5403
5404 CLI->invalidate();
5405 return AfterIP;
5406}
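// The net per-thread effect is the standard dynamic dispatch idiom; roughly
// (a hedged model of the runtime contract, not literal IR; the bounds are
// 1-based here because the "init" call above passes a lower bound of 1):
//
//   __kmpc_dispatch_init_4u(loc, tid, schedType, /*lb=*/1, /*ub=*/tripCount,
//                           /*st=*/1, chunk);
//   uint32_t lb, ub, st;
//   int32_t last;
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &st))
//     for (uint32_t iv = lb - 1; iv < ub; ++iv) // back to a 0-based IV
//       body(iv);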
5407
5408/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5409/// after this \p OldTarget will be orphaned.
5410static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5411 BasicBlock *NewTarget, DebugLoc DL) {
5412 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5413 redirectTo(Pred, NewTarget, DL);
5414}
5415
5416/// Determine which blocks in \p BBs are reachable from outside and remove the
5417/// ones that are not reachable from the function.
5418static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5419 SmallSetVector<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
5420 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5421 for (Use &U : BB->uses()) {
5422 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5423 if (!UseInst)
5424 continue;
5425 if (BBsToErase.count(UseInst->getParent()))
5426 continue;
5427 return true;
5428 }
5429 return false;
5430 };
5431
5432 while (BBsToErase.remove_if(HasRemainingUses)) {
5433 // Try again if anything was removed.
5434 }
5435
5436 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5437 DeleteDeadBlocks(BBVec);
5438}
5439
5440CanonicalLoopInfo *
5441OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5442 InsertPointTy ComputeIP) {
5443 assert(Loops.size() >= 1 && "At least one loop required");
5444 size_t NumLoops = Loops.size();
5445
5446 // Nothing to do if there is already just one loop.
5447 if (NumLoops == 1)
5448 return Loops.front();
5449
5450 CanonicalLoopInfo *Outermost = Loops.front();
5451 CanonicalLoopInfo *Innermost = Loops.back();
5452 BasicBlock *OrigPreheader = Outermost->getPreheader();
5453 BasicBlock *OrigAfter = Outermost->getAfter();
5454 Function *F = OrigPreheader->getParent();
5455
5456 // Loop control blocks that may become orphaned later.
5457 SmallVector<BasicBlock *, 12> OldControlBBs;
5458 OldControlBBs.reserve(6 * Loops.size());
5459 for (CanonicalLoopInfo *Loop : Loops)
5460 Loop->collectControlBlocks(OldControlBBs);
5461
5462 // Setup the IRBuilder for inserting the trip count computation.
5463 Builder.SetCurrentDebugLocation(DL);
5464 if (ComputeIP.isSet())
5465 Builder.restoreIP(ComputeIP);
5466 else
5467 Builder.restoreIP(Outermost->getPreheaderIP());
5468
5469 // Derive the collapsed loop's trip count.
5470 // TODO: Find common/largest indvar type.
5471 Value *CollapsedTripCount = nullptr;
5472 for (CanonicalLoopInfo *L : Loops) {
5473 assert(L->isValid() &&
5474 "All loops to collapse must be valid canonical loops");
5475 Value *OrigTripCount = L->getTripCount();
5476 if (!CollapsedTripCount) {
5477 CollapsedTripCount = OrigTripCount;
5478 continue;
5479 }
5480
5481 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5482 CollapsedTripCount =
5483 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5484 }
5485
5486 // Create the collapsed loop control flow.
5487 CanonicalLoopInfo *Result =
5488 createLoopSkeleton(DL, CollapsedTripCount, F,
5489 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5490
5491 // Build the collapsed loop body code.
5492 // Start with deriving the input loop induction variables from the collapsed
5493 // one, using a divmod scheme. To preserve the original loops' order, the
5494 // innermost loop uses the least significant bits.
5495 Builder.restoreIP(Result->getBodyIP());
5496
5497 Value *Leftover = Result->getIndVar();
5498 SmallVector<Value *> NewIndVars;
5499 NewIndVars.resize(NumLoops);
5500 for (int i = NumLoops - 1; i >= 1; --i) {
5501 Value *OrigTripCount = Loops[i]->getTripCount();
5502
5503 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5504 NewIndVars[i] = NewIndVar;
5505
5506 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5507 }
5508 // Outermost loop gets all the remaining bits.
5509 NewIndVars[0] = Leftover;
5510
5511 // Construct the loop body control flow.
5512 // We progressively construct the branch structure following in direction of
5513 // the control flow, from the leading in-between code, the loop nest body, the
5514 // trailing in-between code, and rejoining the collapsed loop's latch.
5515 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5516 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5517 // its predecessors as sources.
5518 BasicBlock *ContinueBlock = Result->getBody();
5519 BasicBlock *ContinuePred = nullptr;
5520 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5521 BasicBlock *NextSrc) {
5522 if (ContinueBlock)
5523 redirectTo(ContinueBlock, Dest, DL);
5524 else
5525 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5526
5527 ContinueBlock = nullptr;
5528 ContinuePred = NextSrc;
5529 };
5530
5531 // The code before the nested loop of each level.
5532 // Because we are sinking it into the nest, it will be executed more often
5533 // than in the original loop. More sophisticated schemes could keep track of
5534 // what the in-between code is and instantiate it only once per thread.
5535 for (size_t i = 0; i < NumLoops - 1; ++i)
5536 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5537
5538 // Connect the loop nest body.
5539 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5540
5541 // The code after the nested loop at each level.
5542 for (size_t i = NumLoops - 1; i > 0; --i)
5543 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5544
5545 // Connect the finished loop to the collapsed loop latch.
5546 ContinueWith(Result->getLatch(), nullptr);
5547
5548 // Replace the input loops with the new collapsed loop.
5549 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5550 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5551
5552 // Replace the input loop indvars with the derived ones.
5553 for (size_t i = 0; i < NumLoops; ++i)
5554 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5555
5556 // Remove unused parts of the input loops.
5557 removeUnusedBlocksFromParent(OldControlBBs);
5558
5559 for (CanonicalLoopInfo *L : Loops)
5560 L->invalidate();
5561
5562#ifndef NDEBUG
5563 Result->assertOK();
5564#endif
5565 return Result;
5566}
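// Worked example of the divmod derivation above: collapsing two loops with
// trip counts M and N yields one loop of M*N iterations, and each collapsed
// induction value iv is unpacked as (a sketch):
//
//   uint32_t j = iv % N; // innermost loop: least significant part
//   uint32_t i = iv / N; // outermost loop: the remaining bits
//
// E.g. with M=3, N=4, iv=7 maps to (i, j) = (1, 3), preserving the original
// lexicographic iteration order.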
5567
5568std::vector<CanonicalLoopInfo *>
5569OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5570 ArrayRef<Value *> TileSizes) {
5571 assert(TileSizes.size() == Loops.size() &&
5572 "Must pass as many tile sizes as there are loops");
5573 int NumLoops = Loops.size();
5574 assert(NumLoops >= 1 && "At least one loop to tile required");
5575
5576 CanonicalLoopInfo *OutermostLoop = Loops.front();
5577 CanonicalLoopInfo *InnermostLoop = Loops.back();
5578 Function *F = OutermostLoop->getBody()->getParent();
5579 BasicBlock *InnerEnter = InnermostLoop->getBody();
5580 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5581
5582 // Loop control blocks that may become orphaned later.
5583 SmallVector<BasicBlock *, 12> OldControlBBs;
5584 OldControlBBs.reserve(6 * Loops.size());
5585 for (CanonicalLoopInfo *Loop : Loops)
5586 Loop->collectControlBlocks(OldControlBBs);
5587
5588 // Collect original trip counts and induction variable to be accessible by
5589 // index. Also, the structure of the original loops is not preserved during
5590 // the construction of the tiled loops, so do it before we scavenge the BBs of
5591 // any original CanonicalLoopInfo.
5592 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5593 for (CanonicalLoopInfo *L : Loops) {
5594 assert(L->isValid() && "All input loops must be valid canonical loops");
5595 OrigTripCounts.push_back(L->getTripCount());
5596 OrigIndVars.push_back(L->getIndVar());
5597 }
5598
5599 // Collect the code between loop headers. These may contain SSA definitions
5600 // that are used in the loop nest body. To be usable within the innermost
5601 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5602 // these instructions may be executed more often than before the tiling.
5603 // TODO: It would be sufficient to only sink them into body of the
5604 // corresponding tile loop.
5605 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5606 for (int i = 0; i < NumLoops - 1; ++i) {
5607 CanonicalLoopInfo *Surrounding = Loops[i];
5608 CanonicalLoopInfo *Nested = Loops[i + 1];
5609
5610 BasicBlock *EnterBB = Surrounding->getBody();
5611 BasicBlock *ExitBB = Nested->getHeader();
5612 InbetweenCode.emplace_back(EnterBB, ExitBB);
5613 }
5614
5615 // Compute the trip counts of the floor loops.
5616 Builder.SetCurrentDebugLocation(DL);
5617 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5618 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5619 for (int i = 0; i < NumLoops; ++i) {
5620 Value *TileSize = TileSizes[i];
5621 Value *OrigTripCount = OrigTripCounts[i];
5622 Type *IVType = OrigTripCount->getType();
5623
5624 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5625 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5626
5627 // 0 if the tilesize divides the tripcount, 1 otherwise.
5628 // 1 means we need an additional iteration for a partial tile.
5629 //
5630 // Unfortunately we cannot just use the roundup-formula
5631 // (tripcount + tilesize - 1)/tilesize
5632 // because the summation might overflow. We do not want to introduce undefined
5633 // behavior when the untiled loop nest did not.
5634 Value *FloorTripOverflow =
5635 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5636
5637 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5638 Value *FloorTripCount =
5639 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5640 "omp_floor" + Twine(i) + ".tripcount", true);
5641
5642 // Remember some values for later use.
5643 FloorCompleteCount.push_back(FloorCompleteTripCount);
5644 FloorCount.push_back(FloorTripCount);
5645 FloorRems.push_back(FloorTripRem);
5646 }
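// Worked example of the overflow-safe ceil-division above (illustrative):
// OrigTripCount = 10 and TileSize = 4 give FloorCompleteTripCount = 2,
// FloorTripRem = 2, hence FloorTripOverflow = 1 and FloorTripCount = 3. The
// naive (10 + 4 - 1) / 4 form computes the same 3, but its addition can wrap
// for trip counts near the maximum of IVType, which udiv/urem avoids.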
5647
5648 // Generate the new loop nest, from the outermost to the innermost.
5649 std::vector<CanonicalLoopInfo *> Result;
5650 Result.reserve(NumLoops * 2);
5651
5652 // The basic block of the surrounding loop that enters the generated loop
5653 // nest.
5654 BasicBlock *Enter = OutermostLoop->getPreheader();
5655
5656 // The basic block of the surrounding loop where the inner code should
5657 // continue.
5658 BasicBlock *Continue = OutermostLoop->getAfter();
5659
5660 // Where the next loop basic block should be inserted.
5661 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5662
5663 auto EmbeddNewLoop =
5664 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5665 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5666 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5667 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5668 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5669 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5670
5671 // Set up the position where the next embedded loop connects to this loop.
5672 Enter = EmbeddedLoop->getBody();
5673 Continue = EmbeddedLoop->getLatch();
5674 OutroInsertBefore = EmbeddedLoop->getLatch();
5675 return EmbeddedLoop;
5676 };
5677
5678 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5679 const Twine &NameBase) {
5680 for (auto P : enumerate(TripCounts)) {
5681 CanonicalLoopInfo *EmbeddedLoop =
5682 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5683 Result.push_back(EmbeddedLoop);
5684 }
5685 };
5686
5687 EmbeddNewLoops(FloorCount, "floor");
5688
5689 // Within the innermost floor loop, emit the code that computes each tile
5690 // loop's trip count: the full tile size, or the remainder for a partial tile.
5691 Builder.SetInsertPoint(Enter->getTerminator());
5692 SmallVector<Value *, 4> TileCounts;
5693 for (int i = 0; i < NumLoops; ++i) {
5694 CanonicalLoopInfo *FloorLoop = Result[i];
5695 Value *TileSize = TileSizes[i];
5696
5697 Value *FloorIsEpilogue =
5698 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5699 Value *TileTripCount =
5700 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5701
5702 TileCounts.push_back(TileTripCount);
5703 }
5704
5705 // Create the tile loops.
5706 EmbeddNewLoops(TileCounts, "tile");
5707
5708 // Insert the inbetween code into the body.
5709 BasicBlock *BodyEnter = Enter;
5710 BasicBlock *BodyEntered = nullptr;
5711 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5712 BasicBlock *EnterBB = P.first;
5713 BasicBlock *ExitBB = P.second;
5714
5715 if (BodyEnter)
5716 redirectTo(BodyEnter, EnterBB, DL);
5717 else
5718 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5719
5720 BodyEnter = nullptr;
5721 BodyEntered = ExitBB;
5722 }
5723
5724 // Append the original loop nest body into the generated loop nest body.
5725 if (BodyEnter)
5726 redirectTo(BodyEnter, InnerEnter, DL);
5727 else
5728 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5730
5731 // Replace the original induction variable with an induction variable computed
5732 // from the tile and floor induction variables.
5733 Builder.restoreIP(Result.back()->getBodyIP());
5734 for (int i = 0; i < NumLoops; ++i) {
5735 CanonicalLoopInfo *FloorLoop = Result[i];
5736 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5737 Value *OrigIndVar = OrigIndVars[i];
5738 Value *Size = TileSizes[i];
5739
5740 Value *Scale =
5741 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5742 Value *Shift =
5743 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5744 OrigIndVar->replaceAllUsesWith(Shift);
5745 }
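// Sketch of the rebuilt index mapping (values illustrative): with TileSize = 4,
// an original induction variable value is recovered as
//   OrigIndVar = FloorIV * 4 + TileIV
// e.g. FloorIV = 2 and TileIV = 3 yield 11. NUW is sound on both operations
// because the result never exceeds the original trip count.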
5746
5747 // Remove unused parts of the original loops.
5748 removeUnusedBlocksFromParent(OldControlBBs);
5749
5750 for (CanonicalLoopInfo *L : Loops)
5751 L->invalidate();
5752
5753#ifndef NDEBUG
5754 for (CanonicalLoopInfo *GenL : Result)
5755 GenL->assertOK();
5756#endif
5757 return Result;
5758}
5759
5760/// Attach metadata \p Properties to the basic block described by \p BB. If the
5761/// basic block already has metadata, the basic block properties are appended.
5762 static void addBasicBlockMetadata(BasicBlock *BB,
5763 ArrayRef<Metadata *> Properties) {
5764 // Nothing to do if no property to attach.
5765 if (Properties.empty())
5766 return;
5767
5768 LLVMContext &Ctx = BB->getContext();
5769 SmallVector<Metadata *> NewProperties;
5770 NewProperties.push_back(nullptr);
5771
5772 // If the basic block already has metadata, prepend it to the new metadata.
5773 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5774 if (Existing)
5775 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5776
5777 append_range(NewProperties, Properties);
5778 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5779 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5780
5781 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5782}
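// Shape of the node created above, roughly (illustrative IR):
//   br label %loop.header, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}  ; first operand refers back to !0 itself
// where !1, !2, ... are the nodes passed in via Properties.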
5783
5784/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5785/// loop already has metadata, the loop properties are appended.
5786static void addLoopMetadata(CanonicalLoopInfo *Loop,
5787 ArrayRef<Metadata *> Properties) {
5788 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5789
5790 // Attach metadata to the loop's latch
5791 BasicBlock *Latch = Loop->getLatch();
5792 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5793 addBasicBlockMetadata(Latch, Properties);
5794}
5795
5796/// Attach llvm.access.group metadata to the memref instructions of \p Block
5797static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5798 LoopInfo &LI) {
5799 for (Instruction &I : *Block) {
5800 if (I.mayReadOrWriteMemory()) {
5801 // TODO: This instruction may already have access group from
5802 // other pragmas e.g. #pragma clang loop vectorize. Append
5803 // so that the existing metadata is not overwritten.
5804 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5805 }
5806 }
5807}
5808
5809void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5810 LLVMContext &Ctx = Builder.getContext();
5811 addLoopMetadata(
5812 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5813 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5814}
5815
5816void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5817 LLVMContext &Ctx = Builder.getContext();
5818 addLoopMetadata(
5819 Loop, {
5820 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5821 });
5822}
5823
5824void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5825 Value *IfCond, ValueToValueMapTy &VMap,
5826 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5827 const Twine &NamePrefix) {
5828 Function *F = CanonicalLoop->getFunction();
5829
5830 // We can't do
5831 // if (cond) {
5832 // simd_loop;
5833 // } else {
5834 // non_simd_loop;
5835 // }
5836 // because then the CanonicalLoopInfo would only point to one of the loops,
5837 // causing other constructs operating on the same loop to malfunction.
5838 // Instead generate
5839 // while (...) {
5840 // if (cond) {
5841 // simd_body;
5842 // } else {
5843 // not_simd_body;
5844 // }
5845 // }
5846 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5847 // body at -O3
5848
5849 // Define where the if branch should be inserted.
5850 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5851
5852 // Create additional blocks for the if statement
5853 BasicBlock *Cond = SplitBeforeIt->getParent();
5854 llvm::LLVMContext &C = Cond->getContext();
5855 BasicBlock *ThenBlock = BasicBlock::Create(
5856 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5857 BasicBlock *ElseBlock = BasicBlock::Create(
5858 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5859
5860 // Create if condition branch.
5861 Builder.SetInsertPoint(SplitBeforeIt);
5862 Instruction *BrInstr =
5863 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5864 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5865 // Then block contains branch to omp loop body which needs to be vectorized
5866 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5867 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5868
5869 Builder.SetInsertPoint(ElseBlock);
5870
5871 // Clone loop for the else branch
5872 SmallVector<BasicBlock *, 8> NewBlocks;
5873
5874 SmallVector<BasicBlock *, 8> ExistingBlocks;
5875 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5876 ExistingBlocks.push_back(ThenBlock);
5877 ExistingBlocks.append(L->block_begin(), L->block_end());
5878 // Cond is the block that has the if clause condition
5879 // LoopCond is omp_loop.cond
5880 // LoopHeader is omp_loop.header
5881 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5882 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5883 assert(LoopCond && LoopHeader && "Invalid loop structure");
5884 for (BasicBlock *Block : ExistingBlocks) {
5885 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5886 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5887 continue;
5888 }
5889 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5890
5891 // fix name not to be omp.if.then
5892 if (Block == ThenBlock)
5893 NewBB->setName(NamePrefix + ".if.else");
5894
5895 NewBB->moveBefore(CanonicalLoop->getExit());
5896 VMap[Block] = NewBB;
5897 NewBlocks.push_back(NewBB);
5898 }
5899 remapInstructionsInBlocks(NewBlocks, VMap);
5900 Builder.CreateBr(NewBlocks.front());
5901
5902 // The loop latch must have only one predecessor. Currently it is branched to
5903 // from both the 'then' and 'else' branches.
5904 L->getLoopLatch()->splitBasicBlock(
5905 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5906
5907 // Ensure that the then block is added to the loop so we add the attributes in
5908 // the next step
5909 L->addBasicBlockToLoop(ThenBlock, LI);
5910}
5911
5912unsigned
5913OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5914 const StringMap<bool> &Features) {
5915 if (TargetTriple.isX86()) {
5916 if (Features.lookup("avx512f"))
5917 return 512;
5918 else if (Features.lookup("avx"))
5919 return 256;
5920 return 128;
5921 }
5922 if (TargetTriple.isPPC())
5923 return 128;
5924 if (TargetTriple.isWasm())
5925 return 128;
5926 return 0;
5927}
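// Example (illustrative): for an x86 target whose Features map contains
// "avx", this returns 256, i.e. the 256-bit YMM register width; "avx512f"
// yields 512, and plain x86 falls back to the 128-bit SSE width. A zero
// return means no target-specific default is known.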
5928
5929void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5930 MapVector<Value *, Value *> AlignedVars,
5931 Value *IfCond, OrderKind Order,
5932 ConstantInt *Simdlen, ConstantInt *Safelen) {
5933 LLVMContext &Ctx = Builder.getContext();
5934
5935 Function *F = CanonicalLoop->getFunction();
5936
5937 // TODO: We should not rely on pass manager. Currently we use pass manager
5938 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5939 // object. We should have a method which returns all blocks between
5940 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5941 FunctionAnalysisManager FAM;
5942 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5943 FAM.registerPass([]() { return LoopAnalysis(); });
5944 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5945
5946 LoopAnalysis LIA;
5947 LoopInfo &&LI = LIA.run(*F, FAM);
5948
5949 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5950 if (AlignedVars.size()) {
5951 InsertPointTy IP = Builder.saveIP();
5952 for (auto &AlignedItem : AlignedVars) {
5953 Value *AlignedPtr = AlignedItem.first;
5954 Value *Alignment = AlignedItem.second;
5955 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5956 Builder.SetInsertPoint(loadInst->getNextNode());
5957 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5958 Alignment);
5959 }
5960 Builder.restoreIP(IP);
5961 }
5962
5963 if (IfCond) {
5964 ValueToValueMapTy VMap;
5965 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5966 }
5967
5968 SmallPtrSet<BasicBlock *, 8> Reachable;
5969
5970 // Get the basic blocks from the loop in which memref instructions
5971 // can be found.
5972 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5973 // preferably without running any passes.
5974 for (BasicBlock *Block : L->getBlocks()) {
5975 if (Block == CanonicalLoop->getCond() ||
5976 Block == CanonicalLoop->getHeader())
5977 continue;
5978 Reachable.insert(Block);
5979 }
5980
5981 SmallVector<Metadata *> LoopMDList;
5982
5983 // In the presence of a finite 'safelen', it may be unsafe to mark all
5984 // the memory instructions parallel, because loop-carried
5985 // dependences of 'safelen' iterations are possible.
5986 // If clause order(concurrent) is specified then the memory instructions
5987 // are marked parallel even if 'safelen' is finite.
5988 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5989 // Add access group metadata to memory-access instructions.
5990 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5991 for (BasicBlock *BB : Reachable)
5992 addSimdMetadata(BB, AccessGroup, LI);
5993 // TODO: If the loop has existing parallel access metadata, have
5994 // to combine two lists.
5995 LoopMDList.push_back(MDNode::get(
5996 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5997 }
5998
5999 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6000 // versions so we can't add the loop attributes in that case.
6001 if (IfCond) {
6002 // We can still add llvm.loop.parallel_accesses.
6003 addLoopMetadata(CanonicalLoop, LoopMDList);
6004 return;
6005 }
6006
6007 // Use the above access group metadata to create loop level
6008 // metadata, which should be distinct for each loop.
6009 ConstantAsMetadata *BoolConst =
6010 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6011 LoopMDList.push_back(MDNode::get(
6012 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6013
6014 if (Simdlen || Safelen) {
6015 // If both simdlen and safelen clauses are specified, the value of the
6016 // simdlen parameter must be less than or equal to the value of the safelen
6017 // parameter. Therefore, use safelen only in the absence of simdlen.
6018 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6019 LoopMDList.push_back(
6020 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6021 ConstantAsMetadata::get(VectorizeWidth)}));
6022 }
6023
6024 addLoopMetadata(CanonicalLoop, LoopMDList);
6025}
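// Resulting latch metadata for, e.g., `#pragma omp simd simdlen(8)` without
// an if clause would look roughly like (illustrative):
//   !0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1},
//                   !{!"llvm.loop.vectorize.enable", i1 true},
//                   !{!"llvm.loop.vectorize.width", i32 8}}
// where !1 is the distinct access group attached to the memory accesses above.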
6026
6027/// Create the TargetMachine object to query the backend for optimization
6028/// preferences.
6029///
6030/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6031/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6032 /// needed for the LLVM pass pipeline. We use some default options to avoid
6033/// having to pass too many settings from the frontend that probably do not
6034/// matter.
6035///
6036/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6037/// method. If we are going to use TargetMachine for more purposes, especially
6038/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6039 /// might be worth requiring front-ends to pass on their TargetMachine,
6040 /// or at least cache it between methods. Note that while front-ends such as
6041 /// Clang have just a single main TargetMachine per translation unit,
6042 /// "target-cpu" and "target-features" that determine the TargetMachine are
6043 /// per-function and can be overridden using __attribute__((target("OPTIONS"))).
6044static std::unique_ptr<TargetMachine>
6045 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6046 Module *M = F->getParent();
6047
6048 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6049 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6050 const llvm::Triple &Triple = M->getTargetTriple();
6051
6052 std::string Error;
6053 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6054 if (!TheTarget)
6055 return {};
6056
6057 llvm::TargetOptions Options;
6058 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6059 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6060 /*CodeModel=*/std::nullopt, OptLevel));
6061}
6062
6063 /// Heuristically determine the best-performing unroll factor for \p CLI. This
6064 /// depends on the target processor. We reuse the same heuristics as the
6065/// LoopUnrollPass.
6066static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6067 Function *F = CLI->getFunction();
6068
6069 // Assume the user requests the most aggressive unrolling, even if the rest of
6070 // the code is optimized using a lower setting.
6071 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6072 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6073
6074 FunctionAnalysisManager FAM;
6075 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6076 FAM.registerPass([]() { return AssumptionAnalysis(); });
6077 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6078 FAM.registerPass([]() { return LoopAnalysis(); });
6079 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6080 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6081 TargetIRAnalysis TIRA;
6082 if (TM)
6083 TIRA = TargetIRAnalysis(
6084 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6085 FAM.registerPass([&]() { return TIRA; });
6086
6087 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6088 ScalarEvolutionAnalysis SEA;
6089 ScalarEvolution &&SE = SEA.run(*F, FAM);
6090 DominatorTreeAnalysis DTA;
6091 DominatorTree &&DT = DTA.run(*F, FAM);
6092 LoopAnalysis LIA;
6093 LoopInfo &&LI = LIA.run(*F, FAM);
6094 AssumptionAnalysis ACT;
6095 AssumptionCache &&AC = ACT.run(*F, FAM);
6096 OptimizationRemarkEmitter ORE{F};
6097
6098 Loop *L = LI.getLoopFor(CLI->getHeader());
6099 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6100
6101 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6102 L, SE, TTI,
6103 /*BlockFrequencyInfo=*/nullptr,
6104 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6105 /*UserThreshold=*/std::nullopt,
6106 /*UserCount=*/std::nullopt,
6107 /*UserAllowPartial=*/true,
6108 /*UserAllowRuntime=*/true,
6109 /*UserUpperBound=*/std::nullopt,
6110 /*UserFullUnrollMaxCount=*/std::nullopt);
6111
6112 UP.Force = true;
6113
6114 // Account for additional optimizations taking place before the LoopUnrollPass
6115 // would unroll the loop.
6116 UP.Threshold *= UnrollThresholdFactor;
6117 UP.PartialThreshold *= UnrollThresholdFactor;
6118
6119 // Use normal unroll factors even if the rest of the code is optimized for
6120 // size.
6121 UP.OptSizeThreshold = UP.Threshold;
6122 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6123
6124 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6125 << " Threshold=" << UP.Threshold << "\n"
6126 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6127 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6128 << " PartialOptSizeThreshold="
6129 << UP.PartialOptSizeThreshold << "\n");
6130
6131 // Disable peeling.
6132 TargetTransformInfo::PeelingPreferences PP =
6133 gatherPeelingPreferences(L, SE, TTI,
6134 /*UserAllowPeeling=*/false,
6135 /*UserAllowProfileBasedPeeling=*/false,
6136 /*UnrollingSpecficValues=*/false);
6137
6138 SmallPtrSet<const Value *, 32> EphValues;
6139 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6140
6141 // Assume that reads and writes to stack variables can be eliminated by
6142 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6143 // size.
6144 for (BasicBlock *BB : L->blocks()) {
6145 for (Instruction &I : *BB) {
6146 Value *Ptr;
6147 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6148 Ptr = Load->getPointerOperand();
6149 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6150 Ptr = Store->getPointerOperand();
6151 } else
6152 continue;
6153
6154 Ptr = Ptr->stripPointerCasts();
6155
6156 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6157 if (Alloca->getParent() == &F->getEntryBlock())
6158 EphValues.insert(&I);
6159 }
6160 }
6161 }
6162
6163 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6164
6165 // Loop is not unrollable if the loop contains certain instructions.
6166 if (!UCE.canUnroll()) {
6167 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6168 return 1;
6169 }
6170
6171 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6172 << "\n");
6173
6174 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6175 // be able to use it.
6176 int TripCount = 0;
6177 int MaxTripCount = 0;
6178 bool MaxOrZero = false;
6179 unsigned TripMultiple = 0;
6180
6181 bool UseUpperBound = false;
6182 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6183 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6184 UseUpperBound);
6185 unsigned Factor = UP.Count;
6186 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6187
6188 // This function returns 1 to signal that the loop should not be unrolled.
6189 if (Factor == 0)
6190 return 1;
6191 return Factor;
6192}
6193
6194void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6195 int32_t Factor,
6196 CanonicalLoopInfo **UnrolledCLI) {
6197 assert(Factor >= 0 && "Unroll factor must not be negative");
6198
6199 Function *F = Loop->getFunction();
6200 LLVMContext &Ctx = F->getContext();
6201
6202 // If the unrolled loop is not used for another loop-associated directive, it
6203 // is sufficient to add metadata for the LoopUnrollPass.
6204 if (!UnrolledCLI) {
6205 SmallVector<Metadata *, 2> LoopMetadata;
6206 LoopMetadata.push_back(
6207 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6208
6209 if (Factor >= 1) {
6210 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6211 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6212 LoopMetadata.push_back(MDNode::get(
6213 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6214 }
6215
6216 addLoopMetadata(Loop, LoopMetadata);
6217 return;
6218 }
6219
6220 // Heuristically determine the unroll factor.
6221 if (Factor == 0)
6222 Factor = computeHeuristicUnrollFactor(Loop);
6223
6224 // No change required with unroll factor 1.
6225 if (Factor == 1) {
6226 *UnrolledCLI = Loop;
6227 return;
6228 }
6229
6230 assert(Factor >= 2 &&
6231 "unrolling only makes sense with a factor of 2 or larger");
6232
6233 Type *IndVarTy = Loop->getIndVarType();
6234
6235 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6236 // unroll the inner loop.
6237 Value *FactorVal =
6238 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6239 /*isSigned=*/false));
6240 std::vector<CanonicalLoopInfo *> LoopNest =
6241 tileLoops(DL, {Loop}, {FactorVal});
6242 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6243 *UnrolledCLI = LoopNest[0];
6244 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6245
6246 // LoopUnrollPass can only fully unroll loops with constant trip count.
6247 // Unroll by the unroll factor with a fallback epilog for the remainder
6248 // iterations if necessary.
6249 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6250 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6251 addLoopMetadata(
6252 InnerLoop,
6253 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6254 MDNode::get(
6255 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6256
6257#ifndef NDEBUG
6258 (*UnrolledCLI)->assertOK();
6259#endif
6260}
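// Sketch (illustrative): with Factor = 4 and a loop over [0, N), the call to
// tileLoops above yields a floor loop of ceil(N / 4) iterations around a tile
// loop of at most 4 iterations; the llvm.loop.unroll.count metadata then lets
// LoopUnrollPass flatten the inner tile loop, and the floor loop is returned
// through *UnrolledCLI for further loop-associated directives.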
6261
6262OpenMPIRBuilder::InsertPointTy
6263OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6264 llvm::Value *BufSize, llvm::Value *CpyBuf,
6265 llvm::Value *CpyFn, llvm::Value *DidIt) {
6266 if (!updateToLocation(Loc))
6267 return Loc.IP;
6268
6269 uint32_t SrcLocStrSize;
6270 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6271 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6272 Value *ThreadId = getOrCreateThreadID(Ident);
6273
6274 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6275
6276 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6277
6278 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6279 Builder.CreateCall(Fn, Args);
6280
6281 return Builder.saveIP();
6282}
6283
6284OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6285 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6286 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6287 ArrayRef<llvm::Function *> CPFuncs) {
6288
6289 if (!updateToLocation(Loc))
6290 return Loc.IP;
6291
6292 // If needed, allocate and initialize `DidIt` with 0.
6293 // DidIt: flag variable: 1=single thread; 0=not single thread.
6294 llvm::Value *DidIt = nullptr;
6295 if (!CPVars.empty()) {
6296 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6297 Builder.CreateStore(Builder.getInt32(0), DidIt);
6298 }
6299
6300 Directive OMPD = Directive::OMPD_single;
6301 uint32_t SrcLocStrSize;
6302 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6303 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6304 Value *ThreadId = getOrCreateThreadID(Ident);
6305 Value *Args[] = {Ident, ThreadId};
6306
6307 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6308 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6309
6310 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6311 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6312
6313 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6314 if (Error Err = FiniCB(IP))
6315 return Err;
6316
6317 // The thread that executes the single region must set `DidIt` to 1.
6318 // This is used by __kmpc_copyprivate, to know if the caller is the
6319 // single thread or not.
6320 if (DidIt)
6321 Builder.CreateStore(Builder.getInt32(1), DidIt);
6322
6323 return Error::success();
6324 };
6325
6326 // generates the following:
6327 // if (__kmpc_single()) {
6328 // .... single region ...
6329 // __kmpc_end_single
6330 // }
6331 // __kmpc_copyprivate
6332 // __kmpc_barrier
6333
6334 InsertPointOrErrorTy AfterIP =
6335 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6336 /*Conditional*/ true,
6337 /*hasFinalize*/ true);
6338 if (!AfterIP)
6339 return AfterIP.takeError();
6340
6341 if (DidIt) {
6342 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6343 // NOTE BufSize is currently unused, so just pass 0.
6344 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6345 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6346 CPFuncs[I], DidIt);
6347 // NOTE __kmpc_copyprivate already inserts a barrier
6348 } else if (!IsNowait) {
6349 InsertPointOrErrorTy AfterIP =
6350 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6351 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6352 /* CheckCancelFlag */ false);
6353 if (!AfterIP)
6354 return AfterIP.takeError();
6355 }
6356 return Builder.saveIP();
6357}
6358
6359OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6360 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6361 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6362
6363 if (!updateToLocation(Loc))
6364 return Loc.IP;
6365
6366 Directive OMPD = Directive::OMPD_critical;
6367 uint32_t SrcLocStrSize;
6368 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6369 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6370 Value *ThreadId = getOrCreateThreadID(Ident);
6371 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6372 Value *Args[] = {Ident, ThreadId, LockVar};
6373
6374 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6375 Function *RTFn = nullptr;
6376 if (HintInst) {
6377 // Add Hint to entry Args and create call
6378 EnterArgs.push_back(HintInst);
6379 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6380 } else {
6381 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6382 }
6383 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6384
6385 Function *ExitRTLFn =
6386 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6387 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6388
6389 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6390 /*Conditional*/ false, /*hasFinalize*/ true);
6391}
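// Emitted call sequence, roughly (names illustrative):
//   call void @__kmpc_critical(ptr @loc, i32 %tid, ptr @.gomp_critical_user_X.var)
//   ; ... region body from BodyGenCB ...
//   call void @__kmpc_end_critical(ptr @loc, i32 %tid, ptr @.gomp_critical_user_X.var)
// with __kmpc_critical_with_hint substituted for the entry call when HintInst
// is provided.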
6392
6393OpenMPIRBuilder::InsertPointTy
6394OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6395 InsertPointTy AllocaIP, unsigned NumLoops,
6396 ArrayRef<llvm::Value *> StoreValues,
6397 const Twine &Name, bool IsDependSource) {
6398 assert(
6399 llvm::all_of(StoreValues,
6400 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6401 "OpenMP runtime requires depend vec with i64 type");
6402
6403 if (!updateToLocation(Loc))
6404 return Loc.IP;
6405
6406 // Allocate space for vector and generate alloc instruction.
6407 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6408 Builder.restoreIP(AllocaIP);
6409 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6410 ArgsBase->setAlignment(Align(8));
6411 updateToLocation(Loc);
6412
6413 // Store the index value with offset in depend vector.
6414 for (unsigned I = 0; I < NumLoops; ++I) {
6415 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6416 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6417 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6418 STInst->setAlignment(Align(8));
6419 }
6420
6421 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6422 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6423
6424 uint32_t SrcLocStrSize;
6425 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6426 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6427 Value *ThreadId = getOrCreateThreadID(Ident);
6428 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6429
6430 Function *RTLFn = nullptr;
6431 if (IsDependSource)
6432 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6433 else
6434 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6435 Builder.CreateCall(RTLFn, Args);
6436
6437 return Builder.saveIP();
6438}
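// For a two-loop `ordered depend(source)` the generated IR is roughly
// (illustrative):
//   %vec = alloca [2 x i64], align 8       ; at AllocaIP
//   store i64 %iv0, ...                    ; element 0
//   store i64 %iv1, ...                    ; element 1
//   call void @__kmpc_doacross_post(ptr @loc, i32 %tid, ptr %vec)
// depend(sink: ...) emits __kmpc_doacross_wait with the sink iteration vector.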
6439
6440OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6441 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6442 FinalizeCallbackTy FiniCB, bool IsThreads) {
6443 if (!updateToLocation(Loc))
6444 return Loc.IP;
6445
6446 Directive OMPD = Directive::OMPD_ordered;
6447 Instruction *EntryCall = nullptr;
6448 Instruction *ExitCall = nullptr;
6449
6450 if (IsThreads) {
6451 uint32_t SrcLocStrSize;
6452 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6453 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6454 Value *ThreadId = getOrCreateThreadID(Ident);
6455 Value *Args[] = {Ident, ThreadId};
6456
6457 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6458 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6459
6460 Function *ExitRTLFn =
6461 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6462 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6463 }
6464
6465 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6466 /*Conditional*/ false, /*hasFinalize*/ true);
6467}
6468
6469OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6470 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6471 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6472 bool HasFinalize, bool IsCancellable) {
6473
6474 if (HasFinalize)
6475 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6476
6477 // Create inlined region's entry and body blocks, in preparation
6478 // for conditional creation
6479 BasicBlock *EntryBB = Builder.GetInsertBlock();
6480 Instruction *SplitPos = EntryBB->getTerminator();
6481 if (!isa_and_nonnull<BranchInst>(SplitPos))
6482 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6483 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6484 BasicBlock *FiniBB =
6485 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6486
6487 Builder.SetInsertPoint(EntryBB->getTerminator());
6488 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6489
6490 // generate body
6491 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6492 /* CodeGenIP */ Builder.saveIP()))
6493 return Err;
6494
6495 // emit exit call and do any needed finalization.
6496 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6497 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6498 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6499 "Unexpected control flow graph state!!");
6500 InsertPointOrErrorTy AfterIP =
6501 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6502 if (!AfterIP)
6503 return AfterIP.takeError();
6504 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6505 "Unexpected Control Flow State!");
6507
6508 // If we are skipping the region of a non-conditional, remove the exit
6509 // block, and clear the builder's insertion point.
6510 assert(SplitPos->getParent() == ExitBB &&
6511 "Unexpected Insertion point location!");
6512 auto merged = MergeBlockIntoPredecessor(ExitBB);
6513 BasicBlock *ExitPredBB = SplitPos->getParent();
6514 auto InsertBB = merged ? ExitPredBB : ExitBB;
6515 if (!isa_and_nonnull<BranchInst>(SplitPos))
6516 SplitPos->eraseFromParent();
6517 Builder.SetInsertPoint(InsertBB);
6518
6519 return Builder.saveIP();
6520}
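// CFG produced for a conditional region, roughly (illustrative):
//   entry:               %r = call @entry_fn(...)
//                        br i1 (%r != 0), label %omp_region.body, label %omp_region.end
//   omp_region.body:     ; BodyGenCB output
//   omp_region.finalize: ; FiniCB output, then the exit call
//   omp_region.end:      ; execution continues here for all threads
// For non-conditional regions the branch degenerates to straight-line code.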
6521
6522OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6523 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6524 // If there is nothing to do, return the current insertion point.
6525 if (!Conditional || !EntryCall)
6526 return Builder.saveIP();
6527
6528 BasicBlock *EntryBB = Builder.GetInsertBlock();
6529 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6530 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6531 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6532
6533 // Emit thenBB and set the Builder's insertion point there for
6534 // body generation next. Place the block after the current block.
6535 Function *CurFn = EntryBB->getParent();
6536 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6537
6538 // Move Entry branch to end of ThenBB, and replace with conditional
6539 // branch (If-stmt)
6540 Instruction *EntryBBTI = EntryBB->getTerminator();
6541 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6542 EntryBBTI->removeFromParent();
6543 Builder.SetInsertPoint(UI);
6544 Builder.Insert(EntryBBTI);
6545 UI->eraseFromParent();
6546 Builder.SetInsertPoint(ThenBB->getTerminator());
6547
6548 // return an insertion point to ExitBB.
6549 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6550}
6551
6552OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6553 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6554 bool HasFinalize) {
6555
6556 Builder.restoreIP(FinIP);
6557
6558 // If there is finalization to do, emit it before the exit call
6559 if (HasFinalize) {
6560 assert(!FinalizationStack.empty() &&
6561 "Unexpected finalization stack state!");
6562
6563 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6564 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6565
6566 if (Error Err = Fi.FiniCB(FinIP))
6567 return Err;
6568
6569 BasicBlock *FiniBB = FinIP.getBlock();
6570 Instruction *FiniBBTI = FiniBB->getTerminator();
6571
6572 // set Builder IP for call creation
6573 Builder.SetInsertPoint(FiniBBTI);
6574 }
6575
6576 if (!ExitCall)
6577 return Builder.saveIP();
6578
6580 // Place the exit call as the last instruction before the finalization block terminator.
6580 ExitCall->removeFromParent();
6581 Builder.Insert(ExitCall);
6582
6583 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6584 ExitCall->getIterator());
6585}
6586
6587OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6588 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6589 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6590 if (!IP.isSet())
6591 return IP;
6592
6593 IRBuilder<>::InsertPointGuard IPG(Builder);
6594
6595 // creates the following CFG structure
6596 // OMP_Entry : (MasterAddr != PrivateAddr)?
6597 // F T
6598 // | \
6599 // | copyin.not.master
6600 // | /
6601 // v /
6602 // copyin.not.master.end
6603 // |
6604 // v
6605 // OMP.Entry.Next
6606
6607 BasicBlock *OMP_Entry = IP.getBlock();
6608 Function *CurFn = OMP_Entry->getParent();
6609 BasicBlock *CopyBegin =
6610 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6611 BasicBlock *CopyEnd = nullptr;
6612
6613 // If the entry block is terminated, split it to preserve the branch to the
6614 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6615 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6616 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6617 "copyin.not.master.end");
6618 OMP_Entry->getTerminator()->eraseFromParent();
6619 } else {
6620 CopyEnd =
6621 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6622 }
6623
6624 Builder.SetInsertPoint(OMP_Entry);
6625 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6626 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6627 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6628 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6629
6630 Builder.SetInsertPoint(CopyBegin);
6631 if (BranchtoEnd)
6632 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6633
6634 return Builder.saveIP();
6635}
6636
6637CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6638 Value *Size, Value *Allocator,
6639 std::string Name) {
6640 IRBuilder<>::InsertPointGuard IPG(Builder);
6641 updateToLocation(Loc);
6642
6643 uint32_t SrcLocStrSize;
6644 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6645 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6646 Value *ThreadId = getOrCreateThreadID(Ident);
6647 Value *Args[] = {ThreadId, Size, Allocator};
6648
6649 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6650
6651 return Builder.CreateCall(Fn, Args, Name);
6652}
6653
6654CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6655 Value *Addr, Value *Allocator,
6656 std::string Name) {
6657 IRBuilder<>::InsertPointGuard IPG(Builder);
6658 updateToLocation(Loc);
6659
6660 uint32_t SrcLocStrSize;
6661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6662 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6663 Value *ThreadId = getOrCreateThreadID(Ident);
6664 Value *Args[] = {ThreadId, Addr, Allocator};
6665 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6666 return Builder.CreateCall(Fn, Args, Name);
6667}
6668
6669CallInst *OpenMPIRBuilder::createOMPInteropInit(
6670 const LocationDescription &Loc, Value *InteropVar,
6671 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6672 Value *DependenceAddress, bool HaveNowaitClause) {
6673 IRBuilder<>::InsertPointGuard IPG(Builder);
6674 updateToLocation(Loc);
6675
6676 uint32_t SrcLocStrSize;
6677 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6678 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6679 Value *ThreadId = getOrCreateThreadID(Ident);
6680 if (Device == nullptr)
6681 Device = ConstantInt::get(Int32, -1);
6682 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6683 if (NumDependences == nullptr) {
6684 NumDependences = ConstantInt::get(Int32, 0);
6685 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6686 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6687 }
6688 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6689 Value *Args[] = {
6690 Ident, ThreadId, InteropVar, InteropTypeVal,
6691 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6692
6693 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6694
6695 return Builder.CreateCall(Fn, Args);
6696}
6697
6698CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6699 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6700 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6701 IRBuilder<>::InsertPointGuard IPG(Builder);
6702 updateToLocation(Loc);
6703
6704 uint32_t SrcLocStrSize;
6705 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6706 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6707 Value *ThreadId = getOrCreateThreadID(Ident);
6708 if (Device == nullptr)
6709 Device = ConstantInt::get(Int32, -1);
6710 if (NumDependences == nullptr) {
6711 NumDependences = ConstantInt::get(Int32, 0);
6712 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6713 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6714 }
6715 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6716 Value *Args[] = {
6717 Ident, ThreadId, InteropVar, Device,
6718 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6719
6720 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6721
6722 return Builder.CreateCall(Fn, Args);
6723}
6724
6725CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6726 Value *InteropVar, Value *Device,
6727 Value *NumDependences,
6728 Value *DependenceAddress,
6729 bool HaveNowaitClause) {
6730 IRBuilder<>::InsertPointGuard IPG(Builder);
6731 updateToLocation(Loc);
6732 uint32_t SrcLocStrSize;
6733 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6734 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6735 Value *ThreadId = getOrCreateThreadID(Ident);
6736 if (Device == nullptr)
6737 Device = ConstantInt::get(Int32, -1);
6738 if (NumDependences == nullptr) {
6739 NumDependences = ConstantInt::get(Int32, 0);
6740 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6741 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6742 }
6743 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6744 Value *Args[] = {
6745 Ident, ThreadId, InteropVar, Device,
6746 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6747
6748 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6749
6750 return Builder.CreateCall(Fn, Args);
6751}
6752
6753CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6754 const LocationDescription &Loc, llvm::Value *Pointer,
6755 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6756 IRBuilder<>::InsertPointGuard IPG(Builder);
6757 updateToLocation(Loc);
6758
6759 uint32_t SrcLocStrSize;
6760 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6761 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6762 Value *ThreadId = getOrCreateThreadID(Ident);
6763 Constant *ThreadPrivateCache =
6764 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6765 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6766
6767 Function *Fn =
6768 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6769
6770 return Builder.CreateCall(Fn, Args);
6771}
6772
6773OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6774 const LocationDescription &Loc,
6775 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6776 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6777 "expected num_threads and num_teams to be specified");
6778
6779 if (!updateToLocation(Loc))
6780 return Loc.IP;
6781
6782 uint32_t SrcLocStrSize;
6783 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6784 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6785 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6786 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6787 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6788 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6789 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6790
6791 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6792 Function *Kernel = DebugKernelWrapper;
6793
6794 // We need to strip the debug prefix to get the correct kernel name.
6795 StringRef KernelName = Kernel->getName();
6796 const std::string DebugPrefix = "_debug__";
6797 if (KernelName.ends_with(DebugPrefix)) {
6798 KernelName = KernelName.drop_back(DebugPrefix.length());
6799 Kernel = M.getFunction(KernelName);
6800 assert(Kernel && "Expected the real kernel to exist");
6801 }
6802
6803 // Manifest the launch configuration in the metadata matching the kernel
6804 // environment.
6805 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6806 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6807
6808 // If MaxThreads is not set, select the maximum between the default workgroup
6809 // size and the MinThreads value.
6810 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6811 if (MaxThreadsVal < 0)
6812 MaxThreadsVal = std::max(
6813 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6814
6815 if (MaxThreadsVal > 0)
6816 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6817
6818 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6819 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6820 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6821 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6822 Constant *ReductionDataSize =
6823 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6824 Constant *ReductionBufferLength =
6825 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6826
6827 Function *Fn = getOrCreateRuntimeFunctionPtr(
6828 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6829 const DataLayout &DL = Fn->getDataLayout();
6830
6831 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6832 Constant *DynamicEnvironmentInitializer =
6833 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6834 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6835 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6836 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6837 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6838 DL.getDefaultGlobalsAddressSpace());
6839 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6840
6841 Constant *DynamicEnvironment =
6842 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6843 ? DynamicEnvironmentGV
6844 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6845 DynamicEnvironmentPtr);
6846
6847 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6848 ConfigurationEnvironment, {
6849 UseGenericStateMachineVal,
6850 MayUseNestedParallelismVal,
6851 IsSPMDVal,
6852 MinThreads,
6853 MaxThreads,
6854 MinTeams,
6855 MaxTeams,
6856 ReductionDataSize,
6857 ReductionBufferLength,
6858 });
6859 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6860 KernelEnvironment, {
6861 ConfigurationEnvironmentInitializer,
6862 Ident,
6863 DynamicEnvironment,
6864 });
6865 std::string KernelEnvironmentName =
6866 (KernelName + "_kernel_environment").str();
6867 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6868 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6869 KernelEnvironmentInitializer, KernelEnvironmentName,
6870 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6871 DL.getDefaultGlobalsAddressSpace());
6872 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6873
6874 Constant *KernelEnvironment =
6875 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6876 ? KernelEnvironmentGV
6877 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6878 KernelEnvironmentPtr);
6879 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6880 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6881 KernelLaunchEnvironment =
6882 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6883 ? KernelLaunchEnvironment
6884 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6885 KernelLaunchEnvParamTy);
6886 CallInst *ThreadKind =
6887 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6888
6889 Value *ExecUserCode = Builder.CreateICmpEQ(
6890 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6891 "exec_user_code");
6892
6893 // ThreadKind = __kmpc_target_init(...)
6894 // if (ThreadKind == -1)
6895 // user_code
6896 // else
6897 // return;
6898
6899 auto *UI = Builder.CreateUnreachable();
6900 BasicBlock *CheckBB = UI->getParent();
6901 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6902
6903 BasicBlock *WorkerExitBB = BasicBlock::Create(
6904 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6905 Builder.SetInsertPoint(WorkerExitBB);
6906 Builder.CreateRetVoid();
6907
6908 auto *CheckBBTI = CheckBB->getTerminator();
6909 Builder.SetInsertPoint(CheckBBTI);
6910 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6911
6912 CheckBBTI->eraseFromParent();
6913 UI->eraseFromParent();
6914
6915 // Continue in the "user_code" block, see diagram above and in
6916 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6917 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6918}
6919
6920void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6921 int32_t TeamsReductionDataSize,
6922 int32_t TeamsReductionBufferLength) {
6923 if (!updateToLocation(Loc))
6924 return;
6925
6926 Function *Fn = getOrCreateRuntimeFunctionPtr(
6927 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6928
6929 Builder.CreateCall(Fn, {});
6930
6931 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6932 return;
6933
6934 Function *Kernel = Builder.GetInsertBlock()->getParent();
6935 // We need to strip the debug prefix to get the correct kernel name.
6936 StringRef KernelName = Kernel->getName();
6937 const std::string DebugPrefix = "_debug__";
6938 if (KernelName.ends_with(DebugPrefix))
6939 KernelName = KernelName.drop_back(DebugPrefix.length());
6940 auto *KernelEnvironmentGV =
6941 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6942 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6943 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6944 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6945 KernelEnvironmentInitializer,
6946 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6947 NewInitializer = ConstantFoldInsertValueInstruction(
6948 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6949 {0, 8});
6950 KernelEnvironmentGV->setInitializer(NewInitializer);
6951}
6952
6953static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6954 bool Min) {
6955 if (Kernel.hasFnAttribute(Name)) {
6956 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6957 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6958 }
6959 Kernel.addFnAttr(Name, llvm::utostr(Value));
6960}
6961
6962std::pair<int32_t, int32_t>
6963OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6964 int32_t ThreadLimit =
6965 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6966
6967 if (T.isAMDGPU()) {
6968 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6969 if (!Attr.isValid() || !Attr.isStringAttribute())
6970 return {0, ThreadLimit};
6971 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6972 int32_t LB, UB;
6973 if (!llvm::to_integer(UBStr, UB, 10))
6974 return {0, ThreadLimit};
6975 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6976 if (!llvm::to_integer(LBStr, LB, 10))
6977 return {0, UB};
6978 return {LB, UB};
6979 }
6980
6981 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6982 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6983 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6984 }
6985 return {0, ThreadLimit};
6986}
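// Example (illustrative): on AMDGPU, "amdgpu-flat-work-group-size"="1,256"
// combined with "omp_target_thread_limit"=128 yields bounds {1, 128}; on
// NVPTX, "nvvm.maxntid"=256 with no thread limit yields {0, 256}.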
6987
6988void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6989 Function &Kernel, int32_t LB,
6990 int32_t UB) {
6991 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6992
6993 if (T.isAMDGPU()) {
6994 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6995 llvm::utostr(LB) + "," + llvm::utostr(UB));
6996 return;
6997 }
6998
6999 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7000}
7001
7002std::pair<int32_t, int32_t>
7003OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
7004 // TODO: Read from backend annotations if available.
7005 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7006}
7007
7008void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7009 int32_t LB, int32_t UB) {
7010 if (T.isNVPTX())
7011 if (UB > 0)
7012 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7013 if (T.isAMDGPU())
7014 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7015
7016 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7017}
7018
7019void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7020 Function *OutlinedFn) {
7021 if (Config.isTargetDevice()) {
7022 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7023 // TODO: Determine if DSO local can be set to true.
7024 OutlinedFn->setDSOLocal(false);
7025 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7026 if (T.isAMDGCN())
7027 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7028 else if (T.isNVPTX())
7029 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7030 else if (T.isSPIRV())
7031 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7032 }
7033}
7034
7035Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7036 StringRef EntryFnIDName) {
7037 if (Config.isTargetDevice()) {
7038 assert(OutlinedFn && "The outlined function must exist if embedded");
7039 return OutlinedFn;
7040 }
7041
7042 return new GlobalVariable(
7043 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7044 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7045}
7046
7047Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7048 StringRef EntryFnName) {
7049 if (OutlinedFn)
7050 return OutlinedFn;
7051
7052 assert(!M.getGlobalVariable(EntryFnName, true) &&
7053 "Named kernel already exists?");
7054 return new GlobalVariable(
7055 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7056 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7057}
7058
7059Error OpenMPIRBuilder::emitTargetRegionFunction(
7060 TargetRegionEntryInfo &EntryInfo,
7061 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7062 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7063
7064 SmallString<64> EntryFnName;
7065 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7066
7067 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7068 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7069 if (!CBResult)
7070 return CBResult.takeError();
7071 OutlinedFn = *CBResult;
7072 } else {
7073 OutlinedFn = nullptr;
7074 }
7075
7076 // If this target outline function is not an offload entry, we don't need to
7077 // register it. This may be the case for a false if clause, or if there are
7078 // no OpenMP targets.
7079 if (!IsOffloadEntry)
7080 return Error::success();
7081
7082 std::string EntryFnIDName =
7083 Config.isTargetDevice()
7084 ? std::string(EntryFnName)
7085 : createPlatformSpecificName({EntryFnName, "region_id"});
7086
7087 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7088 EntryFnName, EntryFnIDName);
7089 return Error::success();
7090}
7091
7092Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7093 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7094 StringRef EntryFnName, StringRef EntryFnIDName) {
7095 if (OutlinedFn)
7096 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7097 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7098 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7099 OffloadInfoManager.registerTargetRegionEntryInfo(
7100 EntryInfo, EntryAddr, OutlinedFnID,
7101 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7102 return OutlinedFnID;
7103}
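// The (EntryAddr, OutlinedFnID) pair recorded here is what later drives
// emission of the offload entries consumed by the offloading runtime; the
// table layout itself is owned by OffloadEntriesInfoManager.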
7104
7105OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7106 const LocationDescription &Loc, InsertPointTy AllocaIP,
7107 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7108 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7109 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7110 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7111 BodyGenTy BodyGenType)>
7112 BodyGenCB,
7113 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7114 if (!updateToLocation(Loc))
7115 return InsertPointTy();
7116
7117 Builder.restoreIP(CodeGenIP);
7118 // Disable TargetData CodeGen on Device pass.
7119 if (Config.IsTargetDevice.value_or(false)) {
7120 if (BodyGenCB) {
7121 InsertPointOrErrorTy AfterIP =
7122 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7123 if (!AfterIP)
7124 return AfterIP.takeError();
7125 Builder.restoreIP(*AfterIP);
7126 }
7127 return Builder.saveIP();
7128 }
7129
7130 bool IsStandAlone = !BodyGenCB;
7131 MapInfosTy *MapInfo;
7132 // Generate the code for the opening of the data environment. Capture all the
7133 // arguments of the runtime call by reference because they are used in the
7134 // closing of the region.
7135 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7136 InsertPointTy CodeGenIP) -> Error {
7137 MapInfo = &GenMapInfoCB(Builder.saveIP());
7138 if (Error Err = emitOffloadingArrays(
7139 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7140 /*IsNonContiguous=*/true, DeviceAddrCB))
7141 return Err;
7142
7143 TargetDataRTArgs RTArgs;
7144 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7145
7146 // Emit the number of elements in the offloading arrays.
7147 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7148
7149 // Source location for the ident struct
7150 if (!SrcLocInfo) {
7151 uint32_t SrcLocStrSize;
7152 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7153 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7154 }
7155
7156 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7157 SrcLocInfo, DeviceID,
7158 PointerNum, RTArgs.BasePointersArray,
7159 RTArgs.PointersArray, RTArgs.SizesArray,
7160 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7161 RTArgs.MappersArray};
7162
7163 if (IsStandAlone) {
7164 assert(MapperFunc && "MapperFunc missing for standalone target data");
7165
7166 auto TaskBodyCB = [&](Value *, Value *,
7167 IRBuilderBase::InsertPoint) -> Error {
7168 if (Info.HasNoWait) {
7169 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7170 llvm::Constant::getNullValue(VoidPtr),
7171 llvm::Constant::getNullValue(Int32),
7172 llvm::Constant::getNullValue(VoidPtr)});
7173 }
7174
7175 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7176 OffloadingArgs);
7177
7178 if (Info.HasNoWait) {
7179 BasicBlock *OffloadContBlock =
7180 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7181 Function *CurFn = Builder.GetInsertBlock()->getParent();
7182 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7183 Builder.restoreIP(Builder.saveIP());
7184 }
7185 return Error::success();
7186 };
7187
7188 bool RequiresOuterTargetTask = Info.HasNoWait;
7189 if (!RequiresOuterTargetTask)
7190 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7191 /*TargetTaskAllocaIP=*/{}));
7192 else
7193 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7194 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7195 } else {
7196 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7197 omp::OMPRTL___tgt_target_data_begin_mapper);
7198
7199 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7200
7201 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7202 if (isa<AllocaInst>(DeviceMap.second.second)) {
7203 auto *LI =
7204 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7205 Builder.CreateStore(LI, DeviceMap.second.second);
7206 }
7207 }
7208
7209 // If device pointer privatization is required, emit the body of the
7210 // region here. It will have to be duplicated: with and without
7211 // privatization.
7212 InsertPointOrErrorTy AfterIP =
7213 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7214 if (!AfterIP)
7215 return AfterIP.takeError();
7216 Builder.restoreIP(*AfterIP);
7217 }
7218 return Error::success();
7219 };
7220
7221 // If we need device pointer privatization, we need to emit the body of the
7222 // region with no privatization in the 'else' branch of the conditional.
7223 // Otherwise, we don't have to do anything.
7224 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7225 InsertPointTy CodeGenIP) -> Error {
7226 InsertPointOrErrorTy AfterIP =
7227 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7228 if (!AfterIP)
7229 return AfterIP.takeError();
7230 Builder.restoreIP(*AfterIP);
7231 return Error::success();
7232 };
7233
7234 // Generate code for the closing of the data region.
7235 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7236 TargetDataRTArgs RTArgs;
7237 Info.EmitDebug = !MapInfo->Names.empty();
7238 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7239
7240 // Emit the number of elements in the offloading arrays.
7241 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7242
7243 // Source location for the ident struct
7244 if (!SrcLocInfo) {
7245 uint32_t SrcLocStrSize;
7246 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7247 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7248 }
7249
7250 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7251 PointerNum, RTArgs.BasePointersArray,
7252 RTArgs.PointersArray, RTArgs.SizesArray,
7253 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7254 RTArgs.MappersArray};
7255 Function *EndMapperFunc =
7256 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7257
7258 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7259 return Error::success();
7260 };
7261
7262 // We don't have to do anything to close the region if the if clause evaluates
7263 // to false.
7264 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7265 return Error::success();
7266 };
7267
7268 Error Err = [&]() -> Error {
7269 if (BodyGenCB) {
7270 Error Err = [&]() {
7271 if (IfCond)
7272 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7273 return BeginThenGen(AllocaIP, Builder.saveIP());
7274 }();
7275
7276 if (Err)
7277 return Err;
7278
7279 // If we don't require privatization of device pointers, we emit the body
7280 // in between the runtime calls. This avoids duplicating the body code.
7281 InsertPointOrErrorTy AfterIP =
7282 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7283 if (!AfterIP)
7284 return AfterIP.takeError();
7285 restoreIPandDebugLoc(Builder, *AfterIP);
7286
7287 if (IfCond)
7288 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7289 return EndThenGen(AllocaIP, Builder.saveIP());
7290 }
7291 if (IfCond)
7292 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7293 return BeginThenGen(AllocaIP, Builder.saveIP());
7294 }();
7295
7296 if (Err)
7297 return Err;
7298
7299 return Builder.saveIP();
7300}
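// Minimal sketch of the IR shape for a non-standalone data region without
// an 'if' clause (names illustrative):
//   ...populate .offload_baseptrs/.offload_ptrs/.offload_sizes...
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %dev, i32 %n,
//       ptr %bp, ptr %p, ptr %sz, ptr %mt, ptr %mn, ptr %mm)
//   ...region body from BodyGenCB...
//   call void @__tgt_target_data_end_mapper(<same argument shape>)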
7301
7302FunctionCallee
7303OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7304 bool IsGPUDistribute) {
7305 assert((IVSize == 32 || IVSize == 64) &&
7306 "IV size is not compatible with the omp runtime");
7307 RuntimeFunction Name;
7308 if (IsGPUDistribute)
7309 Name = IVSize == 32
7310 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7311 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7312 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7313 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7314 else
7315 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7316 : omp::OMPRTL___kmpc_for_static_init_4u)
7317 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7318 : omp::OMPRTL___kmpc_for_static_init_8u);
7319
7320 return getOrCreateRuntimeFunction(M, Name);
7321}
7322
7323FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7324 bool IVSigned) {
7325 assert((IVSize == 32 || IVSize == 64) &&
7326 "IV size is not compatible with the omp runtime");
7327 RuntimeFunction Name = IVSize == 32
7328 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7329 : omp::OMPRTL___kmpc_dispatch_init_4u)
7330 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7331 : omp::OMPRTL___kmpc_dispatch_init_8u);
7332
7333 return getOrCreateRuntimeFunction(M, Name);
7334}
7335
7336FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7337 bool IVSigned) {
7338 assert((IVSize == 32 || IVSize == 64) &&
7339 "IV size is not compatible with the omp runtime");
7340 RuntimeFunction Name = IVSize == 32
7341 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7342 : omp::OMPRTL___kmpc_dispatch_next_4u)
7343 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7344 : omp::OMPRTL___kmpc_dispatch_next_8u);
7345
7346 return getOrCreateRuntimeFunction(M, Name);
7347}
7348
7349FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7350 bool IVSigned) {
7351 assert((IVSize == 32 || IVSize == 64) &&
7352 "IV size is not compatible with the omp runtime");
7353 RuntimeFunction Name = IVSize == 32
7354 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7355 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7356 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7357 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7358
7359 return getOrCreateRuntimeFunction(M, Name);
7360}
7361
7362FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7363 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7364}
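// Taken together, these callees drive the canonical dynamic-schedule loop,
// roughly (32-bit signed IV shown, arguments abridged):
//   __kmpc_dispatch_init_4(loc, tid, sched, lb, ub, st, chunk);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (i = lb; i <= ub; i += st) body(i);
// with __kmpc_dispatch_fini_4 emitted per chunk for ordered schedules and
// __kmpc_dispatch_deinit used to tear down the dispatch descriptor.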
7365
7366static void FixupDebugInfoForOutlinedFunction(
7367 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7368 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7369
7370 DISubprogram *NewSP = Func->getSubprogram();
7371 if (!NewSP)
7372 return;
7373
7374 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7375
7376 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7377 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7378 // Only use the cached variable if the arg number matches. This is important
7379 // so that DIVariables created for privatized variables are not discarded.
7380 if (NewVar && (arg == NewVar->getArg()))
7381 return NewVar;
7382
7383 NewVar = DILocalVariable::get(
7384 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7385 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7386 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7387 return NewVar;
7388 };
7389
7390 auto UpdateDebugRecord = [&](auto *DR) {
7391 DILocalVariable *OldVar = DR->getVariable();
7392 unsigned ArgNo = 0;
7393 for (auto Loc : DR->location_ops()) {
7394 auto Iter = ValueReplacementMap.find(Loc);
7395 if (Iter != ValueReplacementMap.end()) {
7396 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7397 ArgNo = std::get<1>(Iter->second) + 1;
7398 }
7399 }
7400 if (ArgNo != 0)
7401 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7402 };
7403
7404 // The location and scope of variable intrinsics and records still point to
7405 // the parent function of the target region. Update them.
7406 for (Instruction &I : instructions(Func)) {
7407 assert(!isa<DbgInfoIntrinsic>(I) &&
7408 "Unexpected debug intrinsic");
7409 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7410 UpdateDebugRecord(&DVR);
7411 }
7412 // An extra argument is passed to the device. Create the debug data for it.
7413 if (OMPBuilder.Config.isTargetDevice()) {
7414 DICompileUnit *CU = NewSP->getUnit();
7415 Module *M = Func->getParent();
7416 DIBuilder DB(*M, true, CU);
7417 DIType *VoidPtrTy =
7418 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7419 DILocalVariable *Var = DB.createParameterVariable(
7420 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7421 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7422 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7423 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7424 &(*Func->begin()));
7425 }
7426}
7427
7428static Value *removeASCastIfPresent(Value *V) {
7429 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7430 return cast<Operator>(V)->getOperand(0);
7431 return V;
7432}
7433
7434static Expected<Function *> createOutlinedFunction(
7435 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7436 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7437 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7438 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7439 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7440 SmallVector<Type *> ParameterTypes;
7441 if (OMPBuilder.Config.isTargetDevice()) {
7442 // Add the "implicit" runtime argument we use to provide launch specific
7443 // information for target devices.
7444 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7445 ParameterTypes.push_back(Int8PtrTy);
7446
7447 // All parameters to target devices are passed as pointers
7448 // or i64. This assumes 64-bit address spaces/pointers.
7449 for (auto &Arg : Inputs)
7450 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7451 ? Arg->getType()
7452 : Type::getInt64Ty(Builder.getContext()));
7453 } else {
7454 for (auto &Arg : Inputs)
7455 ParameterTypes.push_back(Arg->getType());
7456 }
7457
7458 auto BB = Builder.GetInsertBlock();
7459 auto M = BB->getModule();
7460 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7461 /*isVarArg*/ false);
7462 auto Func =
7463 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7464
7465 // Forward target-cpu and target-features function attributes from the
7466 // original function to the new outlined function.
7467 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7468
7469 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7470 if (TargetCpuAttr.isStringAttribute())
7471 Func->addFnAttr(TargetCpuAttr);
7472
7473 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7474 if (TargetFeaturesAttr.isStringAttribute())
7475 Func->addFnAttr(TargetFeaturesAttr);
7476
7477 if (OMPBuilder.Config.isTargetDevice()) {
7478 Value *ExecMode =
7479 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7480 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7481 }
7482
7483 // Save insert point.
7484 IRBuilder<>::InsertPointGuard IPG(Builder);
7485 // We will generate the entries in the outlined function but the debug
7486 // location may still be pointing to the parent function. Reset it now.
7487 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7488
7489 // Generate the region into the function.
7490 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7491 Builder.SetInsertPoint(EntryBB);
7492
7493 // Insert target init call in the device compilation pass.
7494 if (OMPBuilder.Config.isTargetDevice())
7495 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7496
7497 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7498
7499 // As we embed the user code in the middle of our target region after we
7500 // generate entry code, we must move what allocas we can into the entry
7501 // block to avoid breaking device optimisations.
7502 if (OMPBuilder.Config.isTargetDevice())
7503 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7504
7505 // Insert target deinit call in the device compilation pass.
7506 BasicBlock *OutlinedBodyBB =
7507 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7508 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7509 Builder.saveIP(),
7510 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7511 if (!AfterIP)
7512 return AfterIP.takeError();
7513 Builder.restoreIP(*AfterIP);
7514 if (OMPBuilder.Config.isTargetDevice())
7515 OMPBuilder.createTargetDeinit(Builder);
7516
7517 // Insert return instruction.
7518 Builder.CreateRetVoid();
7519
7520 // New Alloca IP at entry point of created device function.
7521 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7522 auto AllocaIP = Builder.saveIP();
7523
7524 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7525
7526 // Skip the artificial dyn_ptr on the device.
7527 const auto &ArgRange =
7528 OMPBuilder.Config.isTargetDevice()
7529 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7530 : Func->args();
7531
7532 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7533
7534 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7535 // Things like GEPs can come in the form of Constants. Constants and
7536 // ConstantExprs do not have access to the knowledge of what they're
7537 // contained in, so we must dig a little to find an instruction so we
7538 // can tell if they're used inside of the function we're outlining. We
7539 // also replace the original constant expression with a new instruction
7540 // equivalent; an instruction allows easy modification in the
7541 // following loop, as we then know the constant (now an instruction) is
7542 // owned by our target function and replaceUsesOfWith can be invoked
7543 // on it (this cannot be done with constants). A brand new one also
7544 // allows us to be cautious, as it is perhaps possible the old expression
7545 // was used inside of the function but also exists and is used externally
7546 // (unlikely by the nature of a Constant, but still possible).
7547 // NOTE: We cannot remove dead constants that have been rewritten to
7548 // instructions at this stage; we would run the risk of breaking later
7549 // lowering, as we could still be in the process of lowering the module
7550 // from MLIR to LLVM-IR and the MLIR lowering may still require the
7551 // original constants we have created rewritten versions of.
7552 if (auto *Const = dyn_cast<Constant>(Input))
7553 convertUsersOfConstantsToInstructions(Const, Func, false);
7554
7555 // Collect users before iterating over them to avoid invalidating the
7556 // iteration in case a user uses Input more than once (e.g. a call
7557 // instruction).
7558 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7559 // Collect all the instructions
7560 for (User *User : make_early_inc_range(Users))
7561 if (auto *Instr = dyn_cast<Instruction>(User))
7562 if (Instr->getFunction() == Func)
7563 Instr->replaceUsesOfWith(Input, InputCopy);
7564 };
7565
7566 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7567
7568 // Rewrite uses of input values to parameters.
7569 for (auto InArg : zip(Inputs, ArgRange)) {
7570 Value *Input = std::get<0>(InArg);
7571 Argument &Arg = std::get<1>(InArg);
7572 Value *InputCopy = nullptr;
7573
7574 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7575 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7576 if (!AfterIP)
7577 return AfterIP.takeError();
7578 Builder.restoreIP(*AfterIP);
7579 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7580
7581 // In certain cases a Global may be set up for replacement; however, this
7582 // Global may be used in multiple arguments to the kernel, just segmented
7583 // apart. For example, if we have a global array that is sectioned into
7584 // multiple mappings (technically not legal in OpenMP, but there is a case
7585 // in Fortran for Common Blocks where this is necessary), we will end up
7586 // with GEPs into this array inside the kernel that refer to the Global
7587 // but are technically separate arguments to the kernel for all intents and
7588 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7589 // index, it will fold into a reference to the Global; if we then encounter
7590 // this folded GEP during replacement, all of the references to the
7591 // Global in the kernel will be replaced with the argument we have generated
7592 // that corresponds to it, including any other GEPs that refer to the
7593 // Global that may be other arguments. This will invalidate all of the
7594 // other preceding mapped arguments that refer to the same global that may
7595 // be separate segments. To prevent this, we defer global processing until
7596 // all other processing has been performed.
7597 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
7598 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg))) {
7599 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7600 continue;
7601 }
7602
7603 if (llvm::isa<llvm::ConstantData>(Input))
7604 continue;
7605
7606 ReplaceValue(Input, InputCopy, Func);
7607 }
7608
7609 // Replace all of our deferred Input values, currently just Globals.
7610 for (auto Deferred : DeferredReplacement)
7611 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7612
7613 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7614 ValueReplacementMap);
7615 return Func;
7616}
7617/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7618/// of pointers containing shared data between the parent task and the created
7619/// task.
7620static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7621 IRBuilderBase &Builder,
7622 Value *TaskWithPrivates,
7623 Type *TaskWithPrivatesTy) {
7624
7625 Type *TaskTy = OMPIRBuilder.Task;
7626 LLVMContext &Ctx = Builder.getContext();
7627 Value *TaskT =
7628 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7629 Value *Shareds = TaskT;
7630 // TaskWithPrivatesTy can be one of the following:
7631 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7632 // %struct.privates }
7633 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7634 //
7635 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7636 // its first member has to be the task descriptor. TaskTy is the type of the
7637 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7638 // first member of TaskT gives us the pointer to shared data.
7639 if (TaskWithPrivatesTy != TaskTy)
7640 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7641 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7642}
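// Illustrative IR for the wrapped case, i.e. TaskWithPrivatesTy != TaskTy
// (value names assumed):
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %sh.gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task, i32 0, i32 0
//   %shareds = load ptr, ptr %sh.gep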
7643/// Create an entry point for a target task. It'll have the following
7644/// signature:
7645/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7646/// This function is called from emitTargetTask once the
7647/// code to launch the target kernel has been outlined already.
7648/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7649/// into the task structure so that the deferred target task can access this
7650/// data even after the stack frame of the generating task has been rolled
7651/// back. Offloading arrays contain base pointers, pointers, sizes etc
7652/// of the data that the target kernel will access. These in effect are the
7653/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7654static Function *emitTargetTaskProxyFunction(
7655 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7656 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7657 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7658
7659 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7660 // This is because PrivatesTy is the type of the structure in which
7661 // we pass the offloading arrays to the deferred target task.
7662 assert((!NumOffloadingArrays || PrivatesTy) &&
7663 "PrivatesTy cannot be nullptr when there are offloading arrays "
7664 "to privatize");
7665
7666 Module &M = OMPBuilder.M;
7667 // KernelLaunchFunction is the target launch function, i.e.
7668 // the function that sets up kernel arguments and calls
7669 // __tgt_target_kernel to launch the kernel on the device.
7670 //
7671 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7672
7673 // StaleCI is the CallInst which is the call to the outlined
7674 // target kernel launch function. If there are local live-in values
7675 // that the outlined function uses, then these are aggregated into a structure
7676 // which is passed as the second argument. If there are no local live-in
7677 // values or if all values used by the outlined kernel are global variables,
7678 // then there's only one argument, the threadID. So, StaleCI can be
7679 //
7680 // %structArg = alloca { ptr, ptr }, align 8
7681 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7682 // store ptr %20, ptr %gep_, align 8
7683 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7684 // store ptr %21, ptr %gep_8, align 8
7685 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7686 //
7687 // OR
7688 //
7689 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7690 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7691 StaleCI->getIterator());
7692
7693 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7694
7695 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7696 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7697 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7698
7699 auto ProxyFnTy =
7700 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7701 /* isVarArg */ false);
7702 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7703 ".omp_target_task_proxy_func",
7704 Builder.GetInsertBlock()->getModule());
7705 Value *ThreadId = ProxyFn->getArg(0);
7706 Value *TaskWithPrivates = ProxyFn->getArg(1);
7707 ThreadId->setName("thread.id");
7708 TaskWithPrivates->setName("task");
7709
7710 bool HasShareds = SharedArgsOperandNo > 0;
7711 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7712 BasicBlock *EntryBB =
7713 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7714 Builder.SetInsertPoint(EntryBB);
7715
7716 SmallVector<Value *> KernelLaunchArgs;
7717 KernelLaunchArgs.reserve(StaleCI->arg_size());
7718 KernelLaunchArgs.push_back(ThreadId);
7719
7720 if (HasOffloadingArrays) {
7721 assert(TaskTy != TaskWithPrivatesTy &&
7722 "If there are offloading arrays to pass to the target, "
7723 "TaskTy cannot be the same as TaskWithPrivatesTy");
7724 (void)TaskTy;
7725 Value *Privates =
7726 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7727 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7728 KernelLaunchArgs.push_back(
7729 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7730 }
7731
7732 if (HasShareds) {
7733 auto *ArgStructAlloca =
7734 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7735 assert(ArgStructAlloca &&
7736 "Unable to find the alloca instruction corresponding to arguments "
7737 "for extracted function");
7738 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7739
7740 AllocaInst *NewArgStructAlloca =
7741 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7742
7743 Value *SharedsSize =
7744 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7745
7746 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7747 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7748
7749 Builder.CreateMemCpy(
7750 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7751 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7752 KernelLaunchArgs.push_back(NewArgStructAlloca);
7753 }
7754 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7755 Builder.CreateRetVoid();
7756 return ProxyFn;
7757}
7758static Type *getOffloadingArrayType(Value *V) {
7759
7760 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7761 return GEP->getSourceElementType();
7762 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7763 return Alloca->getAllocatedType();
7764
7765 llvm_unreachable("Unhandled Instruction type");
7766 return nullptr;
7767}
7768// This function returns a struct that has at most two members.
7769// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7770// descriptor. The second member, if needed, is a struct containing arrays
7771// that need to be passed to the offloaded target kernel. For example,
7772// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7773// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7774// respectively, then the types created by this function are
7775//
7776// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7777// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7778// %struct.privates }
7779// %struct.task_with_privates is returned by this function.
7780// If there aren't any offloading arrays to pass to the target kernel,
7781// %struct.kmp_task_ompbuilder_t is returned.
7782static StructType *
7783createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7784 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7785
7786 if (OffloadingArraysToPrivatize.empty())
7787 return OMPIRBuilder.Task;
7788
7789 SmallVector<Type *, 4> StructFieldTypes;
7790 for (Value *V : OffloadingArraysToPrivatize) {
7791 assert(V->getType()->isPointerTy() &&
7792 "Expected pointer to array to privatize. Got a non-pointer value "
7793 "instead");
7794 Type *ArrayTy = getOffloadingArrayType(V);
7795 assert(ArrayTy && "ArrayType cannot be nullptr");
7796 StructFieldTypes.push_back(ArrayTy);
7797 }
7798 StructType *PrivatesStructTy =
7799 StructType::create(StructFieldTypes, "struct.privates");
7800 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7801 "struct.task_with_privates");
7802}
7803static Error emitTargetOutlinedFunction(
7804 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7805 TargetRegionEntryInfo &EntryInfo,
7806 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7807 Function *&OutlinedFn, Constant *&OutlinedFnID,
7808 SmallVectorImpl<Value *> &Inputs,
7809 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7810 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7811
7812 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7813 [&](StringRef EntryFnName) {
7814 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7815 EntryFnName, Inputs, CBFunc,
7816 ArgAccessorFuncCB);
7817 };
7818
7819 return OMPBuilder.emitTargetRegionFunction(
7820 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7821 OutlinedFnID);
7822}
7823
7824OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7825 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7826 OpenMPIRBuilder::InsertPointTy AllocaIP,
7827 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7828 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7829
7830 // The following explains the code-gen scenario for the `target` directive. A
7831 // similar scenario is followed for other device-related directives (e.g.
7832 // `target enter data`), since we only need to emit a task that
7833 // encapsulates the proper runtime call.
7834 //
7835 // When we arrive at this function, the target region itself has been
7836 // outlined into the function OutlinedFn.
7837 // So at this point, for
7838 // --------------------------------------------------------------
7839 // void user_code_that_offloads(...) {
7840 // omp target depend(..) map(from:a) map(to:b) private(i)
7841 // do i = 1, 10
7842 // a(i) = b(i) + n
7843 // }
7844 //
7845 // --------------------------------------------------------------
7846 //
7847 // we have
7848 //
7849 // --------------------------------------------------------------
7850 //
7851 // void user_code_that_offloads(...) {
7852 // %.offload_baseptrs = alloca [2 x ptr], align 8
7853 // %.offload_ptrs = alloca [2 x ptr], align 8
7854 // %.offload_mappers = alloca [2 x ptr], align 8
7855 // ;; target region has been outlined and now we need to
7856 // ;; offload to it via a target task.
7857 // }
7858 // void outlined_device_function(ptr a, ptr b, ptr n) {
7859 // n = *n_ptr;
7860 // do i = 1, 10
7861 // a(i) = b(i) + n
7862 // }
7863 //
7864 // We have to now do the following
7865 // (i) Make an offloading call to outlined_device_function using the OpenMP
7866 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7867 // emitted by emitKernelLaunch
7868 // (ii) Create a task entry point function that calls kernel_launch_function
7869 // and is the entry point for the target task. See
7870 // '@.omp_target_task_proxy_func in the pseudocode below.
7871 // (iii) Create a task with the task entry point created in (ii)
7872 //
7873 // That is we create the following
7874 // struct task_with_privates {
7875 // struct kmp_task_ompbuilder_t task_struct;
7876 // struct privates {
7877 // [2 x ptr] ; baseptrs
7878 // [2 x ptr] ; ptrs
7879 // [2 x i64] ; sizes
7880 // }
7881 // }
7882 // void user_code_that_offloads(...) {
7883 // %.offload_baseptrs = alloca [2 x ptr], align 8
7884 // %.offload_ptrs = alloca [2 x ptr], align 8
7885 // %.offload_sizes = alloca [2 x i64], align 8
7886 //
7887 // %structArg = alloca { ptr, ptr, ptr }, align 8
7888 // %structArg[0] = a
7889 // %structArg[1] = b
7890 // %structArg[2] = &n
7891 //
7892 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7893 // sizeof(kmp_task_ompbuilder_t),
7894 // sizeof(structArg),
7895 // @.omp_target_task_proxy_func,
7896 // ...)
7897 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7898 // sizeof(structArg))
7899 // memcpy(target_task_with_privates->privates->baseptrs,
7900 // offload_baseptrs, sizeof(offload_baseptrs)
7901 // memcpy(target_task_with_privates->privates->ptrs,
7902 // offload_ptrs, sizeof(offload_ptrs)
7903 // memcpy(target_task_with_privates->privates->sizes,
7904 // offload_sizes, sizeof(offload_sizes)
7905 // dependencies_array = ...
7906 // ;; if nowait not present
7907 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7908 // call @__kmpc_omp_task_begin_if0(...)
7909 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
7910 // %target_task_with_privates)
7911 // call @__kmpc_omp_task_complete_if0(...)
7912 // }
7913 //
7914 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7915 // ptr %task) {
7916 // %structArg = alloca {ptr, ptr, ptr}
7917 // %task_ptr = getelementptr(%task, 0, 0)
7918 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7919 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7920 //
7921 // %offloading_arrays = getelementptr(%task, 0, 1)
7922 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7923 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7924 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7925 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7926 // %offload_sizes, %structArg)
7927 // }
7928 //
7929 // We need the proxy function because the signature of the task entry point
7930 // expected by kmpc_omp_task is always the same and will be different from
7931 // that of the kernel_launch function.
7932 //
7933 // kernel_launch_function is generated by emitKernelLaunch and has the
7934 // always_inline attribute. For this example, it'll look like so:
7935 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7936 // %offload_sizes, %structArg) alwaysinline {
7937 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7938 // ; load aggregated data from %structArg
7939 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7940 // ; offload_sizes
7941 // call i32 @__tgt_target_kernel(...,
7942 // outlined_device_function,
7943 // ptr %kernel_args)
7944 // }
7945 // void outlined_device_function(ptr a, ptr b, ptr n) {
7946 // n = *n_ptr;
7947 // do i = 1, 10
7948 // a(i) = b(i) + n
7949 // }
7950 //
7951 BasicBlock *TargetTaskBodyBB =
7952 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7953 BasicBlock *TargetTaskAllocaBB =
7954 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7955
7956 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7957 TargetTaskAllocaBB->begin());
7958 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7959
7960 OutlineInfo OI;
7961 OI.EntryBB = TargetTaskAllocaBB;
7962 OI.OuterAllocaBB = AllocaIP.getBlock();
7963
7964 // Add the thread ID argument.
7965 SmallVector<Instruction *, 4> ToBeDeleted;
7966 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7967 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7968
7969 // Generate the task body which will subsequently be outlined.
7970 Builder.restoreIP(TargetTaskBodyIP);
7971 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7972 return Err;
7973
7974 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7975 // it is given. These blocks are enumerated by
7976 // OpenMPIRBuilder::OutlineInfo::collectBlocks, which expects the OI.ExitBlock
7977 // to be outside the region. In other words, OI.ExitBlock is expected to be
7978 // the start of the region after the outlining. We used to set OI.ExitBlock
7979 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases,
7980 // except when the task body is a single basic block. In that case,
7981 // OI.ExitBlock is set to the single task body block and will get left out of
7982 // the outlining process. So, simply create a new empty block to which we
7983 // unconditionally branch from where TaskBodyCB left off.
7984 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7985 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7986 /*IsFinished=*/true);
7987
7988 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7989 bool NeedsTargetTask = HasNoWait && DeviceID;
7990 if (NeedsTargetTask) {
7991 for (auto *V :
7992 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7993 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7994 RTArgs.SizesArray}) {
7995 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7996 OffloadingArraysToPrivatize.push_back(V);
7997 OI.ExcludeArgsFromAggregate.push_back(V);
7998 }
7999 }
8000 }
8001 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8002 DeviceID, OffloadingArraysToPrivatize](
8003 Function &OutlinedFn) mutable {
8004 assert(OutlinedFn.hasOneUse() &&
8005 "there must be a single user for the outlined function");
8006
8007 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8008
8009 // The first argument of StaleCI is always the thread id.
8010 // The next few arguments are the pointers to offloading arrays,
8011 // if any (see OffloadingArraysToPrivatize).
8012 // Finally, all other local values that are live into the outlined region
8013 // end up in a structure whose pointer is passed as the last argument. This
8014 // piece of data is passed in the "shared" field of the task structure. So,
8015 // we know we have to pass shareds to the task if the number of arguments is
8016 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8017 // thread id. Further, for safety, we assert that the number of arguments of
8018 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
8019 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8020 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8021 assert((!HasShareds ||
8022 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8023 "Wrong number of arguments for StaleCI when shareds are present");
8024 int SharedArgOperandNo =
8025 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8026
8027 StructType *TaskWithPrivatesTy =
8028 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8029 StructType *PrivatesTy = nullptr;
8030
8031 if (!OffloadingArraysToPrivatize.empty())
8032 PrivatesTy =
8033 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8034
8035 Function *ProxyFn = emitTargetTaskProxyFunction(
8036 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8037 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8038
8039 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8040 << "\n");
8041
8042 Builder.SetInsertPoint(StaleCI);
8043
8044 // Gather the arguments for emitting the runtime call.
8045 uint32_t SrcLocStrSize;
8046 Constant *SrcLocStr =
8047 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8048 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8049
8050 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8051 //
8052 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8053 // the DeviceID to the deferred task, and also because
8054 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8055 Function *TaskAllocFn =
8056 !NeedsTargetTask
8057 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8058 : getOrCreateRuntimeFunctionPtr(
8059 OMPRTL___kmpc_omp_target_task_alloc);
8060
8061 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task
8062 // allocation call.
8063 Value *ThreadID = getOrCreateThreadID(Ident);
8064
8065 // Argument - `sizeof_kmp_task_t` (TaskSize)
8066 // Tasksize refers to the size in bytes of kmp_task_t data structure
8067 // plus any other data to be passed to the target task, if any, which
8068 // is packed into a struct. kmp_task_t and the struct so created are
8069 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8070 Value *TaskSize = Builder.getInt64(
8071 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8072
8073 // Argument - `sizeof_shareds` (SharedsSize)
8074 // SharedsSize refers to the shareds array size in the kmp_task_t data
8075 // structure.
8076 Value *SharedsSize = Builder.getInt64(0);
8077 if (HasShareds) {
8078 auto *ArgStructAlloca =
8079 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8080 assert(ArgStructAlloca &&
8081 "Unable to find the alloca instruction corresponding to arguments "
8082 "for extracted function");
8083 auto *ArgStructType =
8084 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8085 assert(ArgStructType && "Unable to find struct type corresponding to "
8086 "arguments for extracted function");
8087 SharedsSize =
8088 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8089 }
8090
8091 // Argument - `flags`
8092 // Task is tied iff (Flags & 1) == 1.
8093 // Task is untied iff (Flags & 1) == 0.
8094 // Task is final iff (Flags & 2) == 2.
8095 // Task is not final iff (Flags & 2) == 0.
8096 // A target task is not final and is untied.
8097 Value *Flags = Builder.getInt32(0);
8098
8099 // Emit the @__kmpc_omp_task_alloc runtime call
8100 // The runtime call returns a pointer to an area where the task captured
8101 // variables must be copied before the task is run (TaskData)
8102 CallInst *TaskData = nullptr;
8103
8104 SmallVector<llvm::Value *> TaskAllocArgs = {
8105 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8106 /*flags=*/Flags,
8107 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8108 /*task_func=*/ProxyFn};
8109
8110 if (NeedsTargetTask) {
8111 assert(DeviceID && "Expected non-empty device ID.");
8112 TaskAllocArgs.push_back(DeviceID);
8113 }
8114
8115 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8116
8117 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8118 if (HasShareds) {
8119 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8121 *this, Builder, TaskData, TaskWithPrivatesTy);
8122 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8123 SharedsSize);
8124 }
8125 if (!OffloadingArraysToPrivatize.empty()) {
8126 Value *Privates =
8127 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8128 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8129 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8130 [[maybe_unused]] Type *ArrayType =
8131 getOffloadingArrayType(PtrToPrivatize);
8132 assert(ArrayType && "ArrayType cannot be nullptr");
8133
8134 Type *ElementType = PrivatesTy->getElementType(i);
8135 assert(ElementType == ArrayType &&
8136 "ElementType should match ArrayType");
8137 (void)ArrayType;
8138
8139 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8140 Builder.CreateMemCpy(
8141 Dst, Alignment, PtrToPrivatize, Alignment,
8142 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8143 }
8144 }
8145
8146 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8147
8148 // ---------------------------------------------------------------
8149 // V5.2 13.8 target construct
8150 // If the nowait clause is present, execution of the target task
8151 // may be deferred. If the nowait clause is not present, the target task is
8152 // an included task.
8153 // ---------------------------------------------------------------
8154 // The above means that the lack of a nowait on the target construct
8155 // translates to '#pragma omp task if(0)'
8156 if (!NeedsTargetTask) {
8157 if (DepArray) {
8158 Function *TaskWaitFn =
8159 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8160 Builder.CreateCall(
8161 TaskWaitFn,
8162 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8163 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8164 /*dep_list=*/DepArray,
8165 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8166 /*noalias_dep_list=*/
8167 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8168 }
8169 // Included task.
8170 Function *TaskBeginFn =
8171 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8172 Function *TaskCompleteFn =
8173 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8174 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8175 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8176 CI->setDebugLoc(StaleCI->getDebugLoc());
8177 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8178 } else if (DepArray) {
8179 // HasNoWait - meaning the task may be deferred. Call
8180 // __kmpc_omp_task_with_deps if there are dependencies,
8181 // else call __kmpc_omp_task
8182 Function *TaskFn =
8183 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8184 Builder.CreateCall(
8185 TaskFn,
8186 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8187 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8188 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8189 } else {
8190 // Emit the @__kmpc_omp_task runtime call to spawn the task
8191 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8192 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8193 }
8194
8195 StaleCI->eraseFromParent();
8196 for (Instruction *I : llvm::reverse(ToBeDeleted))
8197 I->eraseFromParent();
8198 };
8199 addOutlineInfo(std::move(OI));
8200
8201 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8202 << *(Builder.GetInsertBlock()) << "\n");
8203 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8204 << *(Builder.GetInsertBlock()->getParent()->getParent())
8205 << "\n");
8206 return Builder.saveIP();
8207}
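// To summarize the runtime calls selected above (sketch):
//   deferrable (nowait, device ID known): __kmpc_omp_target_task_alloc,
//     then __kmpc_omp_task_with_deps if dependencies exist, else
//     __kmpc_omp_task;
//   included (no nowait): __kmpc_omp_task_alloc, an optional
//     __kmpc_omp_wait_deps, then __kmpc_omp_task_begin_if0, a direct call
//     of the proxy function, and __kmpc_omp_task_complete_if0.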
8208
8209Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8210 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8211 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8212 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8213 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8214 if (Error Err =
8215 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8216 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8217 return Err;
8218 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8219 return Error::success();
8220}
8221
8222static void emitTargetCall(
8223 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8224 OpenMPIRBuilder::InsertPointTy AllocaIP,
8225 OpenMPIRBuilder::TargetDataInfo &Info,
8226 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8227 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8228 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8229 SmallVectorImpl<Value *> &Args,
8230 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8231 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8232 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8233 bool HasNoWait, Value *DynCGroupMem,
8234 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8235 // Generate a function call to the host fallback implementation of the target
8236 // region. This is called by the host when no offload entry was generated for
8237 // the target region and when the offloading call fails at runtime.
8238 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8239 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8240 Builder.restoreIP(IP);
8241 Builder.CreateCall(OutlinedFn, Args);
8242 return Builder.saveIP();
8243 };
8244
8245 bool HasDependencies = Dependencies.size() > 0;
8246 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8247
8248 OpenMPIRBuilder::TargetKernelArgs KArgs;
8249
8250 auto TaskBodyCB =
8251 [&](Value *DeviceID, Value *RTLoc,
8252 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8253 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8254 // produce any.
8255 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8256 // emitKernelLaunch makes the necessary runtime call to offload the
8257 // kernel. We then outline all that code into a separate function
8258 // ('kernel_launch_function' in the pseudo code above). This function is
8259 // then called by the target task proxy function (see
8260 // '@.omp_target_task_proxy_func' in the pseudo code above)
8261 // "@.omp_target_task_proxy_func' is generated by
8262 // emitTargetTaskProxyFunction.
8263 if (OutlinedFnID && DeviceID)
8264 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8265 EmitTargetCallFallbackCB, KArgs,
8266 DeviceID, RTLoc, TargetTaskAllocaIP);
8267
8268 // We only need to do the outlining if `DeviceID` is set to avoid calling
8269 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8270 // generating the `else` branch of an `if` clause.
8271 //
8272 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8273 // In this case, we execute the host implementation directly.
8274 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8275 }());
8276
8277 OMPBuilder.Builder.restoreIP(AfterIP);
8278 return Error::success();
8279 };
8280
8281 auto &&EmitTargetCallElse =
8282 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8283 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8284 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8285 // produce any.
8286 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8287 if (RequiresOuterTargetTask) {
8288 // Arguments that are intended to be directly forwarded to an
8289 // emitKernelLaunch call are passed as nullptr, since
8290 // OutlinedFnID=nullptr results in that call not being done.
8291 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8292 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8293 /*RTLoc=*/nullptr, AllocaIP,
8294 Dependencies, EmptyRTArgs, HasNoWait);
8295 }
8296 return EmitTargetCallFallbackCB(Builder.saveIP());
8297 }());
8298
8299 Builder.restoreIP(AfterIP);
8300 return Error::success();
8301 };
8302
8303 auto &&EmitTargetCallThen =
8304 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8305 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8306 Info.HasNoWait = HasNoWait;
8307 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8308 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8309 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8310 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8311 /*IsNonContiguous=*/true,
8312 /*ForEndCall=*/false))
8313 return Err;
8314
8315 SmallVector<Value *, 3> NumTeamsC;
8316 for (auto [DefaultVal, RuntimeVal] :
8317 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8318 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8319 : Builder.getInt32(DefaultVal));
8320
8321 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8322 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8323 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8324 if (Clause)
8325 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8326 /*isSigned=*/false);
8327 return Clause;
8328 };
8329 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8330 if (Clause)
8331 Result =
8332 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8333 Result, Clause)
8334 : Clause;
8335 };
8336
8337 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8338 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8339 SmallVector<Value *, 3> NumThreadsC;
8340 Value *MaxThreadsClause =
8341 RuntimeAttrs.TeamsThreadLimit.size() == 1
8342 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8343 : nullptr;
8344
8345 for (auto [TeamsVal, TargetVal] : zip_equal(
8346 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8347 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8348 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8349
8350 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8351 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8352
8353 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8354 }
8355
8356 unsigned NumTargetItems = Info.NumberOfPtrs;
8357 // TODO: Use correct device ID
8358 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8359 uint32_t SrcLocStrSize;
8360 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8361 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8362 llvm::omp::IdentFlag(0), 0);
8363
8364 Value *TripCount = RuntimeAttrs.LoopTripCount
8365 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8366 Builder.getInt64Ty(),
8367 /*isSigned=*/false)
8368 : Builder.getInt64(0);
8369
8370 // Request zero groupprivate bytes by default.
8371 if (!DynCGroupMem)
8372 DynCGroupMem = Builder.getInt32(0);
8373
8374 KArgs = OpenMPIRBuilder::TargetKernelArgs(
8375 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
8376 HasNoWait, DynCGroupMemFallback);
8377
8378 // Assume no error was returned because TaskBodyCB and
8379 // EmitTargetCallFallbackCB don't produce any.
8380 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8381 // The presence of certain clauses on the target directive requires the
8382 // explicit generation of the target task.
8383 if (RequiresOuterTargetTask)
8384 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8385 Dependencies, KArgs.RTArgs,
8386 Info.HasNoWait);
8387
8388 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8389 EmitTargetCallFallbackCB, KArgs,
8390 DeviceID, RTLoc, AllocaIP);
8391 }());
8392
8393 Builder.restoreIP(AfterIP);
8394 return Error::success();
8395 };
8396
8397 // If we don't have an ID for the target region, it means an offload entry
8398 // wasn't created. In this case we just run the host fallback directly and
8399 // ignore any potential 'if' clauses.
8400 if (!OutlinedFnID) {
8401 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8402 return;
8403 }
8404
8405 // If there's no 'if' clause, only generate the kernel launch code path.
8406 if (!IfCond) {
8407 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8408 return;
8409 }
8410
8411 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8412 EmitTargetCallElse, AllocaIP));
8413}
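// Dispatch summary for the three exits above: without an OutlinedFnID no
// offload entry exists, so only the host fallback is emitted; without an
// IfCond only the kernel-launch path is emitted; otherwise emitIfClause
// selects between EmitTargetCallThen and EmitTargetCallElse at runtime.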
8414
8415OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8416 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8417 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8418 TargetRegionEntryInfo &EntryInfo,
8419 const TargetKernelDefaultAttrs &DefaultAttrs,
8420 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8421 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8422 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8423 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8424 CustomMapperCallbackTy CustomMapperCB,
8425 const SmallVector<DependData> &Dependencies, bool HasNowait,
8426 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8427
8428 if (!updateToLocation(Loc))
8429 return InsertPointTy();
8430
8431 Builder.restoreIP(CodeGenIP);
8432
8433 Function *OutlinedFn;
8434 Constant *OutlinedFnID = nullptr;
8435 // The target region is outlined into its own function. The LLVM IR for
8436 // the target region itself is generated using the callbacks CBFunc
8437 // and ArgAccessorFuncCB.
8438 if (Error Err = emitTargetOutlinedFunction(
8439 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8440 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8441 return Err;
8442
8443 // If we are not on the target device, then we need to generate code
8444 // to make a remote call (offload) to the previously outlined function
8445 // that represents the target region. Do that now.
8446 if (!Config.isTargetDevice())
8447 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8448 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8449 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
8450 DynCGroupMemFallback);
8451 return Builder.saveIP();
8452}
8453
8454std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8455 StringRef FirstSeparator,
8456 StringRef Separator) {
8457 SmallString<128> Buffer;
8458 llvm::raw_svector_ostream OS(Buffer);
8459 StringRef Sep = FirstSeparator;
8460 for (StringRef Part : Parts) {
8461 OS << Sep << Part;
8462 Sep = Separator;
8463 }
8464 return OS.str().str();
8465}
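// Example (inputs assumed): getNameWithSeparators({"x", "y", "z"}, "__", ".")
// yields "__x.y.z"; the first separator is printed once, before the first
// part, and the regular separator joins the remaining parts.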
8466
8467std::string
8468OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8469 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8470 Config.separator());
8471}
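// Worked example (illustrative, not part of the original source): the first
// part is prefixed with FirstSeparator and every later part with Separator,
// so
//   getNameWithSeparators({"a", "b", "c"}, "_", "$") == "_a$b$c"
// and, on a host configuration whose separators are both ".",
//   createPlatformSpecificName({"offload_maptypes"}) == ".offload_maptypes".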
8472
8473GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
8474 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
8475 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8476 if (Elem.second) {
8477 assert(Elem.second->getValueType() == Ty &&
8478 "OMP internal variable has different type than requested");
8479 } else {
8480 // TODO: investigate the appropriate linkage type used for the global
8481 // variable for possibly changing that to internal or private, or maybe
8482 // create different versions of the function for different OMP internal
8483 // variables.
8484 const DataLayout &DL = M.getDataLayout();
8485 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
8486 // default global AS is 1.
8487 // See double-target-call-with-declare-target.f90 and
8488 // declare-target-vars-in-target-region.f90 libomptarget
8489 // tests.
8490 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
8491 : M.getTargetTriple().isAMDGPU()
8492 ? 0
8493 : DL.getDefaultGlobalsAddressSpace();
8494 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8495 ? GlobalValue::InternalLinkage
8496 : GlobalValue::CommonLinkage;
8497 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8498 Constant::getNullValue(Ty), Elem.first(),
8499 /*InsertBefore=*/nullptr,
8500 GlobalValue::NotThreadLocal, AddressSpaceVal);
8501 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8502 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
8503 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8504 Elem.second = GV;
8505 }
8506
8507 return Elem.second;
8508}
8509
8510Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8511 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8512 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8513 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8514}
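// Example (illustrative): a critical construct named "foo" therefore shares
// the module-level lock variable ".gomp_critical_user_foo.var" of type
// KmpCriticalNameTy with every other critical region of the same name, since
// the separators here are hardcoded to ".".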
8515
8516Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8517 LLVMContext &Ctx = Builder.getContext();
8518 Value *Null =
8519 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8520 Value *SizeGep =
8521 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8522 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8523 return SizePtrToInt;
8524}
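// This is the classic null-GEP "sizeof" idiom; the emitted IR looks roughly
// like (illustrative, with T standing for BasePtr's LLVM type):
//
//   %gep = getelementptr T, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64
//
// i.e. the byte offset of element 1 from a null base is the size of T.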
8525
8526 GlobalVariable *
8527 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8528 std::string VarName) {
8529 llvm::Constant *MaptypesArrayInit =
8530 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8531 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8532 M, MaptypesArrayInit->getType(),
8533 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8534 VarName);
8535 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8536 return MaptypesArrayGlobal;
8537}
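// A minimal usage sketch (an assumption for illustration, not part of the
// original source; `OMPBuilder` is a placeholder instance). Packing two
// "tofrom" entries, whose flag value is OMP_MAP_TO | OMP_MAP_FROM (0x3):
//
//   SmallVector<uint64_t> Mappings = {0x3, 0x3};
//   GlobalVariable *GV =
//       OMPBuilder.createOffloadMaptypes(Mappings, ".offload_maptypes");
//
// The result is a private, unnamed_addr constant array read by the runtime.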
8538
8539void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8540 InsertPointTy AllocaIP,
8541 unsigned NumOperands,
8542 struct MapperAllocas &MapperAllocas) {
8543 if (!updateToLocation(Loc))
8544 return;
8545
8546 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8547 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8548 Builder.restoreIP(AllocaIP);
8549 AllocaInst *ArgsBase = Builder.CreateAlloca(
8550 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8551 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8552 ".offload_ptrs");
8553 AllocaInst *ArgSizes = Builder.CreateAlloca(
8554 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8555 updateToLocation(Loc);
8556 MapperAllocas.ArgsBase = ArgsBase;
8557 MapperAllocas.Args = Args;
8558 MapperAllocas.ArgSizes = ArgSizes;
8559}
8560
8561void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8562 Function *MapperFunc, Value *SrcLocInfo,
8563 Value *MaptypesArg, Value *MapnamesArg,
8564 struct MapperAllocas &MapperAllocas,
8565 int64_t DeviceID, unsigned NumOperands) {
8566 if (!updateToLocation(Loc))
8567 return;
8568
8569 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8570 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8571 Value *ArgsBaseGEP =
8572 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8573 {Builder.getInt32(0), Builder.getInt32(0)});
8574 Value *ArgsGEP =
8575 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8576 {Builder.getInt32(0), Builder.getInt32(0)});
8577 Value *ArgSizesGEP =
8578 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8579 {Builder.getInt32(0), Builder.getInt32(0)});
8580 Value *NullPtr =
8581 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8582 Builder.CreateCall(MapperFunc,
8583 {SrcLocInfo, Builder.getInt64(DeviceID),
8584 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8585 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8586}
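// For example (illustrative), with MapperFunc = __tgt_target_data_begin_mapper
// the emitted call has the shape:
//
//   call void @__tgt_target_data_begin_mapper(ptr %loc, i64 %device_id,
//       i32 %num_args, ptr %.offload_baseptrs, ptr %.offload_ptrs,
//       ptr %.offload_sizes, ptr %maptypes, ptr %mapnames, ptr null)
//
// where the trailing null stands for the absent user-defined mapper array.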
8587
8588void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8589 TargetDataRTArgs &RTArgs,
8590 TargetDataInfo &Info,
8591 bool ForEndCall) {
8592 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8593 "expected region end call to runtime only when end call is separate");
8594 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8595 auto VoidPtrTy = UnqualPtrTy;
8596 auto VoidPtrPtrTy = UnqualPtrTy;
8597 auto Int64Ty = Type::getInt64Ty(M.getContext());
8598 auto Int64PtrTy = UnqualPtrTy;
8599
8600 if (!Info.NumberOfPtrs) {
8601 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8602 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8603 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8604 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8605 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8606 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8607 return;
8608 }
8609
8610 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8611 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8612 Info.RTArgs.BasePointersArray,
8613 /*Idx0=*/0, /*Idx1=*/0);
8614 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8615 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8616 /*Idx0=*/0,
8617 /*Idx1=*/0);
8618 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8619 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8620 /*Idx0=*/0, /*Idx1=*/0);
8621 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8622 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8623 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8624 : Info.RTArgs.MapTypesArray,
8625 /*Idx0=*/0,
8626 /*Idx1=*/0);
8627
8628 // Only emit the mapper information arrays if debug information is
8629 // requested.
8630 if (!Info.EmitDebug)
8631 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8632 else
8633 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8634 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8635 /*Idx0=*/0,
8636 /*Idx1=*/0);
8637 // If there is no user-defined mapper, set the mapper array to nullptr to
8638 // avoid an unnecessary data privatization
8639 if (!Info.HasMapper)
8640 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8641 else
8642 RTArgs.MappersArray =
8643 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8644}
8645
8646void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8647 InsertPointTy CodeGenIP,
8648 MapInfosTy &CombinedInfo,
8649 TargetDataInfo &Info) {
8650 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8651 CombinedInfo.NonContigInfo;
8652
8653 // Build an array of struct descriptor_dim and then assign it to
8654 // offload_args.
8655 //
8656 // struct descriptor_dim {
8657 // uint64_t offset;
8658 // uint64_t count;
8659 // uint64_t stride
8660 // };
8661 Type *Int64Ty = Builder.getInt64Ty();
8662 StructType *DimTy = StructType::create(
8663 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8664 "struct.descriptor_dim");
8665
8666 enum { OffsetFD = 0, CountFD, StrideFD };
8667 // We need two index variables here since the size of "Dims" is the same as
8668 // the size of Components; however, the size of offset, count, and stride is
8669 // equal to the size of the base declaration that is non-contiguous.
8670 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8671 // Skip emitting ir if dimension size is 1 since it cannot be
8672 // non-contiguous.
8673 if (NonContigInfo.Dims[I] == 1)
8674 continue;
8675 Builder.restoreIP(AllocaIP);
8676 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8677 AllocaInst *DimsAddr =
8678 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8679 Builder.restoreIP(CodeGenIP);
8680 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8681 unsigned RevIdx = EE - II - 1;
8682 Value *DimsLVal = Builder.CreateInBoundsGEP(
8683 DimsAddr->getAllocatedType(), DimsAddr,
8684 {Builder.getInt64(0), Builder.getInt64(II)});
8685 // Offset
8686 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8687 Builder.CreateAlignedStore(
8688 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8689 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8690 // Count
8691 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8692 Builder.CreateAlignedStore(
8693 NonContigInfo.Counts[L][RevIdx], CountLVal,
8694 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8695 // Stride
8696 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8697 Builder.CreateAlignedStore(
8698 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8699 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8700 }
8701 // args[I] = &dims
8702 Builder.restoreIP(CodeGenIP);
8703 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8704 DimsAddr, Builder.getPtrTy());
8705 Value *P = Builder.CreateConstInBoundsGEP2_32(
8706 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8707 Info.RTArgs.PointersArray, 0, I);
8708 Builder.CreateAlignedStore(
8709 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8710 ++L;
8711 }
8712}
8713
8714void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8715 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8716 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8717 BasicBlock *ExitBB, bool IsInit) {
8718 StringRef Prefix = IsInit ? ".init" : ".del";
8719
8720 // Evaluate if this is an array section.
8721 BasicBlock *BodyBB = BasicBlock::Create(
8722 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8723 Value *IsArray =
8724 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8725 Value *DeleteBit = Builder.CreateAnd(
8726 MapType,
8727 Builder.getInt64(
8728 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8729 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8730 Value *DeleteCond;
8731 Value *Cond;
8732 if (IsInit) {
8733 // base != begin?
8734 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8735 // IsPtrAndObj?
8736 Value *PtrAndObjBit = Builder.CreateAnd(
8737 MapType,
8738 Builder.getInt64(
8739 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8740 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8741 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8742 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8743 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8744 DeleteCond = Builder.CreateIsNull(
8745 DeleteBit,
8746 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8747 } else {
8748 Cond = IsArray;
8749 DeleteCond = Builder.CreateIsNotNull(
8750 DeleteBit,
8751 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8752 }
8753 Cond = Builder.CreateAnd(Cond, DeleteCond);
8754 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8755
8756 emitBlock(BodyBB, MapperFn);
8757 // Get the array size by multiplying element size and element number (i.e., \p
8758 // Size).
8759 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8760 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8761 // memory allocation/deletion purpose only.
8762 Value *MapTypeArg = Builder.CreateAnd(
8763 MapType,
8764 Builder.getInt64(
8765 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8766 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8767 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8768 MapTypeArg = Builder.CreateOr(
8769 MapTypeArg,
8770 Builder.getInt64(
8771 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8772 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8773
8774 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8775 // data structure.
8776 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8777 ArraySize, MapTypeArg, MapName};
8778 Builder.CreateCall(
8779 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8780 OffloadingArgs);
8781}
8782
8783Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8784 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8785 llvm::Value *BeginArg)>
8786 GenMapInfoCB,
8787 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8788 SmallVector<Type *> Params;
8789 Params.emplace_back(Builder.getPtrTy());
8790 Params.emplace_back(Builder.getPtrTy());
8791 Params.emplace_back(Builder.getPtrTy());
8792 Params.emplace_back(Builder.getInt64Ty());
8793 Params.emplace_back(Builder.getInt64Ty());
8794 Params.emplace_back(Builder.getPtrTy());
8795
8796 auto *FnTy =
8797 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8798
8799 SmallString<64> TyStr;
8800 raw_svector_ostream Out(TyStr);
8801 Function *MapperFn =
8802 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8803 MapperFn->addFnAttr(Attribute::NoInline);
8804 MapperFn->addFnAttr(Attribute::NoUnwind);
8805 MapperFn->addParamAttr(0, Attribute::NoUndef);
8806 MapperFn->addParamAttr(1, Attribute::NoUndef);
8807 MapperFn->addParamAttr(2, Attribute::NoUndef);
8808 MapperFn->addParamAttr(3, Attribute::NoUndef);
8809 MapperFn->addParamAttr(4, Attribute::NoUndef);
8810 MapperFn->addParamAttr(5, Attribute::NoUndef);
8811
8812 // Start the mapper function code generation.
8813 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8814 auto SavedIP = Builder.saveIP();
8815 Builder.SetInsertPoint(EntryBB);
8816
8817 Value *MapperHandle = MapperFn->getArg(0);
8818 Value *BaseIn = MapperFn->getArg(1);
8819 Value *BeginIn = MapperFn->getArg(2);
8820 Value *Size = MapperFn->getArg(3);
8821 Value *MapType = MapperFn->getArg(4);
8822 Value *MapName = MapperFn->getArg(5);
8823
8824 // Compute the starting and end addresses of array elements.
8825 // Prepare common arguments for array initiation and deletion.
8826 // Convert the size in bytes into the number of array elements.
8827 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8828 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8829 Value *PtrBegin = BeginIn;
8830 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8831
8832 // Emit array initiation if this is an array section and \p MapType indicates
8833 // that memory allocation is required.
8834 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8835 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8836 MapType, MapName, ElementSize, HeadBB,
8837 /*IsInit=*/true);
8838
8839 // Emit a for loop to iterate through SizeArg elements and map all of them.
8840
8841 // Emit the loop header block.
8842 emitBlock(HeadBB, MapperFn);
8843 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8844 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8845 // Evaluate whether the initial condition is satisfied.
8846 Value *IsEmpty =
8847 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8848 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8849
8850 // Emit the loop body block.
8851 emitBlock(BodyBB, MapperFn);
8852 BasicBlock *LastBB = BodyBB;
8853 PHINode *PtrPHI =
8854 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8855 PtrPHI->addIncoming(PtrBegin, HeadBB);
8856
8857 // Get map clause information. Fill up the arrays with all mapped variables.
8858 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8859 if (!Info)
8860 return Info.takeError();
8861
8862 // Call the runtime API __tgt_mapper_num_components to get the number of
8863 // pre-existing components.
8864 Value *OffloadingArgs[] = {MapperHandle};
8865 Value *PreviousSize = Builder.CreateCall(
8866 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8867 OffloadingArgs);
8868 Value *ShiftedPreviousSize =
8869 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8870
8871 // Fill up the runtime mapper handle for all components.
8872 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8873 Value *CurBaseArg = Info->BasePointers[I];
8874 Value *CurBeginArg = Info->Pointers[I];
8875 Value *CurSizeArg = Info->Sizes[I];
8876 Value *CurNameArg = Info->Names.size()
8877 ? Info->Names[I]
8878 : Constant::getNullValue(Builder.getPtrTy());
8879
8880 // Extract the MEMBER_OF field from the map type.
8881 Value *OriMapType = Builder.getInt64(
8882 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8883 Info->Types[I]));
8884 Value *MemberMapType =
8885 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8886
8887 // Combine the map type inherited from user-defined mapper with that
8888 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8889 // bits of the \a MapType, which is the input argument of the mapper
8890 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8891 // bits of MemberMapType.
8892 // [OpenMP 5.0], 1.2.6. map-type decay.
8893 // | alloc | to | from | tofrom | release | delete
8894 // ----------------------------------------------------------
8895 // alloc | alloc | alloc | alloc | alloc | release | delete
8896 // to | alloc | to | alloc | to | release | delete
8897 // from | alloc | alloc | from | from | release | delete
8898 // tofrom | alloc | to | from | tofrom | release | delete
8899 Value *LeftToFrom = Builder.CreateAnd(
8900 MapType,
8901 Builder.getInt64(
8902 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8903 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8904 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8905 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8906 BasicBlock *AllocElseBB =
8907 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8908 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8909 BasicBlock *ToElseBB =
8910 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8911 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8912 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8913 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8914 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8915 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8916 emitBlock(AllocBB, MapperFn);
8917 Value *AllocMapType = Builder.CreateAnd(
8918 MemberMapType,
8919 Builder.getInt64(
8920 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8921 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8922 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8923 Builder.CreateBr(EndBB);
8924 emitBlock(AllocElseBB, MapperFn);
8925 Value *IsTo = Builder.CreateICmpEQ(
8926 LeftToFrom,
8927 Builder.getInt64(
8928 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8929 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8930 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8931 // In case of to, clear OMP_MAP_FROM.
8932 emitBlock(ToBB, MapperFn);
8933 Value *ToMapType = Builder.CreateAnd(
8934 MemberMapType,
8935 Builder.getInt64(
8936 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8937 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8938 Builder.CreateBr(EndBB);
8939 emitBlock(ToElseBB, MapperFn);
8940 Value *IsFrom = Builder.CreateICmpEQ(
8941 LeftToFrom,
8942 Builder.getInt64(
8943 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8944 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8945 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8946 // In case of from, clear OMP_MAP_TO.
8947 emitBlock(FromBB, MapperFn);
8948 Value *FromMapType = Builder.CreateAnd(
8949 MemberMapType,
8950 Builder.getInt64(
8951 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8952 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8953 // In case of tofrom, do nothing.
8954 emitBlock(EndBB, MapperFn);
8955 LastBB = EndBB;
8956 PHINode *CurMapType =
8957 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8958 CurMapType->addIncoming(AllocMapType, AllocBB);
8959 CurMapType->addIncoming(ToMapType, ToBB);
8960 CurMapType->addIncoming(FromMapType, FromBB);
8961 CurMapType->addIncoming(MemberMapType, ToElseBB);
8962
8963 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8964 CurSizeArg, CurMapType, CurNameArg};
8965
8966 auto ChildMapperFn = CustomMapperCB(I);
8967 if (!ChildMapperFn)
8968 return ChildMapperFn.takeError();
8969 if (*ChildMapperFn) {
8970 // Call the corresponding mapper function.
8971 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8972 } else {
8973 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8974 // data structure.
8975 Builder.CreateCall(
8976 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8977 OffloadingArgs);
8978 }
8979 }
8980
8981 // Update the pointer to point to the next element that needs to be mapped,
8982 // and check whether we have mapped all elements.
8983 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8984 "omp.arraymap.next");
8985 PtrPHI->addIncoming(PtrNext, LastBB);
8986 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8987 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8988 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8989
8990 emitBlock(ExitBB, MapperFn);
8991 // Emit array deletion if this is an array section and \p MapType indicates
8992 // that deletion is required.
8993 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8994 MapType, MapName, ElementSize, DoneBB,
8995 /*IsInit=*/false);
8996
8997 // Emit the function exit block.
8998 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8999
9000 Builder.CreateRetVoid();
9001 Builder.restoreIP(SavedIP);
9002 return MapperFn;
9003}
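// Overall shape of the generated mapper function (illustrative):
//
//   void <FuncName>(ptr %handle, ptr %base, ptr %begin, i64 %size,
//                   i64 %type, ptr %name) {
//     ; .init: register storage for array sections when allocation is needed
//     ; loop:  for each element, push one component per map clause via
//     ;        __tgt_push_mapper_component (or call a child mapper)
//     ; .del:  release storage for array sections when deletion is needed
//   }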
9004
9005Error OpenMPIRBuilder::emitOffloadingArrays(
9006 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9007 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9008 bool IsNonContiguous,
9009 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9010
9011 // Reset the array information.
9012 Info.clearArrayInfo();
9013 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9014
9015 if (Info.NumberOfPtrs == 0)
9016 return Error::success();
9017
9018 Builder.restoreIP(AllocaIP);
9019 // Detect if we have any capture size requiring runtime evaluation of the
9020 // size so that a constant array can eventually be used.
9021 ArrayType *PointerArrayType =
9022 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9023
9024 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9025 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9026
9027 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9028 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9029 AllocaInst *MappersArray = Builder.CreateAlloca(
9030 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9031 Info.RTArgs.MappersArray = MappersArray;
9032
9033 // If we don't have any VLA types or other types that require runtime
9034 // evaluation, we can use a constant array for the map sizes, otherwise we
9035 // need to fill up the arrays as we do for the pointers.
9036 Type *Int64Ty = Builder.getInt64Ty();
9037 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9038 ConstantInt::get(Int64Ty, 0));
9039 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9040 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9041 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9042 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9043 if (IsNonContiguous &&
9044 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9045 CombinedInfo.Types[I] &
9046 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9047 ConstSizes[I] =
9048 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9049 else
9050 ConstSizes[I] = CI;
9051 continue;
9052 }
9053 }
9054 RuntimeSizes.set(I);
9055 }
9056
9057 if (RuntimeSizes.all()) {
9058 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9059 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9060 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9061 restoreIPandDebugLoc(Builder, CodeGenIP);
9062 } else {
9063 auto *SizesArrayInit = ConstantArray::get(
9064 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9065 std::string Name = createPlatformSpecificName({"offload_sizes"});
9066 auto *SizesArrayGbl =
9067 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9068 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9069 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9070
9071 if (!RuntimeSizes.any()) {
9072 Info.RTArgs.SizesArray = SizesArrayGbl;
9073 } else {
9074 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9075 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9076 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9077 AllocaInst *Buffer = Builder.CreateAlloca(
9078 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9079 Buffer->setAlignment(OffloadSizeAlign);
9080 restoreIPandDebugLoc(Builder, CodeGenIP);
9081 Builder.CreateMemCpy(
9082 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9083 SizesArrayGbl, OffloadSizeAlign,
9084 Builder.getIntN(
9085 IndexSize,
9086 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9087
9088 Info.RTArgs.SizesArray = Buffer;
9089 }
9090 restoreIPandDebugLoc(Builder, CodeGenIP);
9091 }
9092
9093 // The map types are always constant so we don't need to generate code to
9094 // fill arrays. Instead, we create an array constant.
9095 SmallVector<uint64_t, 4> Mapping;
9096 for (auto mapFlag : CombinedInfo.Types)
9097 Mapping.push_back(
9098 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9099 mapFlag));
9100 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9101 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9102 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9103
9104 // The information types are only built if provided.
9105 if (!CombinedInfo.Names.empty()) {
9106 auto *MapNamesArrayGbl = createOffloadMapnames(
9107 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9108 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9109 Info.EmitDebug = true;
9110 } else {
9111 Info.RTArgs.MapNamesArray =
9112 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9113 Info.EmitDebug = false;
9114 }
9115
9116 // If there's a present map type modifier, it must not be applied to the end
9117 // of a region, so generate a separate map type array in that case.
9118 if (Info.separateBeginEndCalls()) {
9119 bool EndMapTypesDiffer = false;
9120 for (uint64_t &Type : Mapping) {
9121 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9122 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9123 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9124 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9125 EndMapTypesDiffer = true;
9126 }
9127 }
9128 if (EndMapTypesDiffer) {
9129 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9130 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9131 }
9132 }
9133
9134 PointerType *PtrTy = Builder.getPtrTy();
9135 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9136 Value *BPVal = CombinedInfo.BasePointers[I];
9137 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9138 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9139 0, I);
9140 Builder.CreateAlignedStore(BPVal, BP,
9141 M.getDataLayout().getPrefTypeAlign(PtrTy));
9142
9143 if (Info.requiresDevicePointerInfo()) {
9144 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9145 CodeGenIP = Builder.saveIP();
9146 Builder.restoreIP(AllocaIP);
9147 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9148 Builder.restoreIP(CodeGenIP);
9149 if (DeviceAddrCB)
9150 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9151 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9152 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9153 if (DeviceAddrCB)
9154 DeviceAddrCB(I, BP);
9155 }
9156 }
9157
9158 Value *PVal = CombinedInfo.Pointers[I];
9159 Value *P = Builder.CreateConstInBoundsGEP2_32(
9160 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9161 I);
9162 // TODO: Check that the alignment is correct.
9163 Builder.CreateAlignedStore(PVal, P,
9164 M.getDataLayout().getPrefTypeAlign(PtrTy));
9165
9166 if (RuntimeSizes.test(I)) {
9167 Value *S = Builder.CreateConstInBoundsGEP2_32(
9168 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9169 /*Idx0=*/0,
9170 /*Idx1=*/I);
9171 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9172 Int64Ty,
9173 /*isSigned=*/true),
9174 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9175 }
9176 // Fill up the mapper array.
9177 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9178 Value *MFunc = ConstantPointerNull::get(PtrTy);
9179
9180 auto CustomMFunc = CustomMapperCB(I);
9181 if (!CustomMFunc)
9182 return CustomMFunc.takeError();
9183 if (*CustomMFunc)
9184 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9185
9186 Value *MAddr = Builder.CreateInBoundsGEP(
9187 MappersArray->getAllocatedType(), MappersArray,
9188 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9189 Builder.CreateAlignedStore(
9190 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9191 }
9192
9193 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9194 Info.NumberOfPtrs == 0)
9195 return Error::success();
9196 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9197 return Error::success();
9198}
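// For two mapped variables this materializes, roughly (illustrative):
//
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs = alloca [2 x ptr]
//   %.offload_mappers = alloca [2 x ptr]
//
// together with a constant @.offload_sizes global when every size is a
// compile-time constant, a plain alloca when every size is runtime-evaluated,
// or, in the mixed case, an alloca initialized by memcpy from the constant
// global and then patched with the runtime-evaluated sizes.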
9199
9200void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9201 BasicBlock *CurBB = Builder.GetInsertBlock();
9202
9203 if (!CurBB || CurBB->getTerminator()) {
9204 // If there is no insert point or the previous block is already
9205 // terminated, don't touch it.
9206 } else {
9207 // Otherwise, create a fall-through branch.
9208 Builder.CreateBr(Target);
9209 }
9210
9211 Builder.ClearInsertionPoint();
9212}
9213
9214void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9215 bool IsFinished) {
9216 BasicBlock *CurBB = Builder.GetInsertBlock();
9217
9218 // Fall out of the current block (if necessary).
9219 emitBranch(BB);
9220
9221 if (IsFinished && BB->use_empty()) {
9222 BB->eraseFromParent();
9223 return;
9224 }
9225
9226 // Place the block after the current block, if possible, or else at
9227 // the end of the function.
9228 if (CurBB && CurBB->getParent())
9229 CurFn->insert(std::next(CurBB->getIterator()), BB);
9230 else
9231 CurFn->insert(CurFn->end(), BB);
9232 Builder.SetInsertPoint(BB);
9233}
9234
9235Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9236 BodyGenCallbackTy ElseGen,
9237 InsertPointTy AllocaIP) {
9238 // If the condition constant folds and can be elided, try to avoid emitting
9239 // the condition and the dead arm of the if/else.
9240 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9241 auto CondConstant = CI->getSExtValue();
9242 if (CondConstant)
9243 return ThenGen(AllocaIP, Builder.saveIP());
9244
9245 return ElseGen(AllocaIP, Builder.saveIP());
9246 }
9247
9248 Function *CurFn = Builder.GetInsertBlock()->getParent();
9249
9250 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9251 // emit the conditional branch.
9252 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9253 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9254 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9255 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9256 // Emit the 'then' code.
9257 emitBlock(ThenBlock, CurFn);
9258 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9259 return Err;
9260 emitBranch(ContBlock);
9261 // Emit the 'else' code if present.
9262 // There is no need to emit line number for unconditional branch.
9263 emitBlock(ElseBlock, CurFn);
9264 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9265 return Err;
9266 // There is no need to emit line number for unconditional branch.
9267 emitBranch(ContBlock);
9268 // Emit the continuation block for code after the if.
9269 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9270 return Error::success();
9271}
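// Example (illustrative; ThenGen/ElseGen stand for any callbacks): a constant
// condition never emits the dead arm, so
// emitIfClause(Builder.getTrue(), ThenGen, ElseGen, AllocaIP) just invokes
// ThenGen, while a non-constant %cond produces
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms falling through to %omp_if.end.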
9272
9273bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9274 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9275 assert(!(AO == AtomicOrdering::NotAtomic ||
9276 AO == AtomicOrdering::Unordered) &&
9277 "Unexpected Atomic Ordering.");
9278
9279 bool Flush = false;
9280 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
9281
9282 switch (AK) {
9283 case Read:
9284 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9285 AO == AtomicOrdering::SequentiallyConsistent) {
9286 FlushAO = AtomicOrdering::Acquire;
9287 Flush = true;
9288 }
9289 break;
9290 case Write:
9291 case Compare:
9292 case Update:
9293 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9294 AO == AtomicOrdering::SequentiallyConsistent) {
9295 FlushAO = AtomicOrdering::Release;
9296 Flush = true;
9297 }
9298 break;
9299 case Capture:
9300 switch (AO) {
9301 case AtomicOrdering::Acquire:
9302 FlushAO = AtomicOrdering::Acquire;
9303 Flush = true;
9304 break;
9305 case AtomicOrdering::Release:
9306 FlushAO = AtomicOrdering::Release;
9307 Flush = true;
9308 break;
9309 case AtomicOrdering::AcquireRelease:
9310 case AtomicOrdering::SequentiallyConsistent:
9311 FlushAO = AtomicOrdering::AcquireRelease;
9312 Flush = true;
9313 break;
9314 default:
9315 // do nothing - leave silently.
9316 break;
9317 }
9318 }
9319
9320 if (Flush) {
9321 // The flush runtime call does not yet take a memory ordering, so until
9322 // it does, this resolves which atomic ordering to use but still issues
9323 // the plain flush call.
9324 // TODO: pass `FlushAO` after memory ordering support is added
9325 (void)FlushAO;
9326 emitFlush(Loc);
9327 }
9328
9329 // For AO == AtomicOrdering::Monotonic and all other case combinations,
9330 // do nothing.
9331 return Flush;
9332}
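// Summary of the mapping above (illustrative):
//   read:                 acquire, acq_rel, seq_cst -> acquire flush
//   write/update/compare: release, acq_rel, seq_cst -> release flush
//   capture:              acquire -> acquire, release -> release,
//                         acq_rel/seq_cst -> acq_rel flush
// Monotonic (relaxed) accesses never flush.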
9333
9334OpenMPIRBuilder::InsertPointTy
9335OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9336 AtomicOpValue &X, AtomicOpValue &V,
9337 AtomicOrdering AO, InsertPointTy AllocaIP) {
9338 if (!updateToLocation(Loc))
9339 return Loc.IP;
9340
9341 assert(X.Var->getType()->isPointerTy() &&
9342 "OMP Atomic expects a pointer to target memory");
9343 Type *XElemTy = X.ElemTy;
9344 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9345 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9346 "OMP atomic read expected a scalar type");
9347
9348 Value *XRead = nullptr;
9349
9350 if (XElemTy->isIntegerTy()) {
9351 LoadInst *XLD =
9352 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9353 XLD->setAtomic(AO);
9354 XRead = cast<Value>(XLD);
9355 } else if (XElemTy->isStructTy()) {
9356 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9357 // target does not support `atomicrmw` of the size of the struct
9358 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9359 OldVal->setAtomic(AO);
9360 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9361 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9362 OpenMPIRBuilder::AtomicInfo atomicInfo(
9363 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9364 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9365 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9366 XRead = AtomicLoadRes.first;
9367 OldVal->eraseFromParent();
9368 } else {
9369 // We need to perform atomic op as integer
9370 IntegerType *IntCastTy =
9371 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9372 LoadInst *XLoad =
9373 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9374 XLoad->setAtomic(AO);
9375 if (XElemTy->isFloatingPointTy()) {
9376 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9377 } else {
9378 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9379 }
9380 }
9381 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9382 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9383 return Builder.saveIP();
9384}
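// A minimal usage sketch (an assumption for illustration, not part of the
// original file): lowering `v = x` as `#pragma omp atomic read` for i32.
// `OMPBuilder`, `XAddr`, and `VAddr` are placeholders, and the AtomicOpValue
// member order {Var, ElemTy, IsSigned, IsVolatile} is assumed from its
// declaration.
static void exampleAtomicRead(OpenMPIRBuilder &OMPBuilder,
                              IRBuilderBase &Builder,
                              const OpenMPIRBuilder::LocationDescription &Loc,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              Value *XAddr, Value *VAddr) {
  OpenMPIRBuilder::AtomicOpValue X{XAddr, Builder.getInt32Ty(),
                                   /*IsSigned=*/true, /*IsVolatile=*/false};
  OpenMPIRBuilder::AtomicOpValue V{VAddr, Builder.getInt32Ty(),
                                   /*IsSigned=*/true, /*IsVolatile=*/false};
  // Emits an atomic load of %x and a plain store of the result to %v.
  Builder.restoreIP(OMPBuilder.createAtomicRead(
      Loc, X, V, AtomicOrdering::Monotonic, AllocaIP));
}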
9385
9386OpenMPIRBuilder::InsertPointTy
9387OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9388 AtomicOpValue &X, Value *Expr,
9389 AtomicOrdering AO, InsertPointTy AllocaIP) {
9390 if (!updateToLocation(Loc))
9391 return Loc.IP;
9392
9393 assert(X.Var->getType()->isPointerTy() &&
9394 "OMP Atomic expects a pointer to target memory");
9395 Type *XElemTy = X.ElemTy;
9396 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9397 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9398 "OMP atomic write expected a scalar type");
9399
9400 if (XElemTy->isIntegerTy()) {
9401 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9402 XSt->setAtomic(AO);
9403 } else if (XElemTy->isStructTy()) {
9404 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9405 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9406 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9407 OpenMPIRBuilder::AtomicInfo atomicInfo(
9408 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9409 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9410 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9411 OldVal->eraseFromParent();
9412 } else {
9413 // We need to bitcast and perform atomic op as integers
9414 IntegerType *IntCastTy =
9415 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9416 Value *ExprCast =
9417 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9418 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9419 XSt->setAtomic(AO);
9420 }
9421
9422 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9423 return Builder.saveIP();
9424}
9425
9426OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9427 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9428 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9429 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9430 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9431 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9432 if (!updateToLocation(Loc))
9433 return Loc.IP;
9434
9435 LLVM_DEBUG({
9436 Type *XTy = X.Var->getType();
9437 assert(XTy->isPointerTy() &&
9438 "OMP Atomic expects a pointer to target memory");
9439 Type *XElemTy = X.ElemTy;
9440 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9441 XElemTy->isPointerTy()) &&
9442 "OMP atomic update expected a scalar type");
9443 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9444 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9445 "OpenMP atomic does not support LT or GT operations");
9446 });
9447
9448 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9449 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9450 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9451 if (!AtomicResult)
9452 return AtomicResult.takeError();
9453 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9454 return Builder.saveIP();
9455}
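// Example (illustrative): for `#pragma omp atomic update` of the form
// `x = x + expr` on an integer, RMWOp is AtomicRMWInst::Add and the update
// collapses to a single `atomicrmw add ptr %x, i32 %expr`; types or
// operations with no atomicrmw form fall back to the compare-exchange loop
// in emitAtomicUpdate below.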
9456
9457// FIXME: Duplicating AtomicExpand
9458Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9459 AtomicRMWInst::BinOp RMWOp) {
9460 switch (RMWOp) {
9461 case AtomicRMWInst::Add:
9462 return Builder.CreateAdd(Src1, Src2);
9463 case AtomicRMWInst::Sub:
9464 return Builder.CreateSub(Src1, Src2);
9465 case AtomicRMWInst::And:
9466 return Builder.CreateAnd(Src1, Src2);
9467 case AtomicRMWInst::Nand:
9468 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9469 case AtomicRMWInst::Or:
9470 return Builder.CreateOr(Src1, Src2);
9471 case AtomicRMWInst::Xor:
9472 return Builder.CreateXor(Src1, Src2);
9473 case AtomicRMWInst::Xchg:
9474 case AtomicRMWInst::FAdd:
9475 case AtomicRMWInst::FSub:
9476 case AtomicRMWInst::BAD_BINOP:
9477 case AtomicRMWInst::Max:
9478 case AtomicRMWInst::Min:
9479 case AtomicRMWInst::UMax:
9480 case AtomicRMWInst::UMin:
9481 case AtomicRMWInst::FMax:
9482 case AtomicRMWInst::FMin:
9483 case AtomicRMWInst::FMaximum:
9484 case AtomicRMWInst::FMinimum:
9485 case AtomicRMWInst::UIncWrap:
9486 case AtomicRMWInst::UDecWrap:
9487 case AtomicRMWInst::USubCond:
9488 case AtomicRMWInst::USubSat:
9489 llvm_unreachable("Unsupported atomic update operation");
9490 }
9491 llvm_unreachable("Unsupported atomic update operation");
9492}
9493
9494Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9495 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9496 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9497 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9498 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9499 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9500 // or a complex datatype.
9501 bool emitRMWOp = false;
9502 switch (RMWOp) {
9503 case AtomicRMWInst::Add:
9504 case AtomicRMWInst::And:
9505 case AtomicRMWInst::Nand:
9506 case AtomicRMWInst::Or:
9507 case AtomicRMWInst::Xor:
9508 case AtomicRMWInst::Xchg:
9509 emitRMWOp = XElemTy;
9510 break;
9511 case AtomicRMWInst::Sub:
9512 emitRMWOp = (IsXBinopExpr && XElemTy);
9513 break;
9514 default:
9515 emitRMWOp = false;
9516 }
9517 emitRMWOp &= XElemTy->isIntegerTy();
9518
9519 std::pair<Value *, Value *> Res;
9520 if (emitRMWOp) {
9521 AtomicRMWInst *RMWInst =
9522 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9523 if (T.isAMDGPU()) {
9524 if (IsIgnoreDenormalMode)
9525 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9526 llvm::MDNode::get(Builder.getContext(), {}));
9527 if (!IsFineGrainedMemory)
9528 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9529 llvm::MDNode::get(Builder.getContext(), {}));
9530 if (!IsRemoteMemory)
9531 RMWInst->setMetadata("amdgpu.no.remote.memory",
9532 llvm::MDNode::get(Builder.getContext(), {}));
9533 }
9534 Res.first = RMWInst;
9535 // Not needed except in case of postfix captures. Generate anyway for
9536 // consistency with the else part. Will be removed by any DCE pass.
9537 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9538 if (RMWOp == AtomicRMWInst::Xchg)
9539 Res.second = Res.first;
9540 else
9541 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9542 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9543 XElemTy->isStructTy()) {
9544 LoadInst *OldVal =
9545 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9546 OldVal->setAtomic(AO);
9547 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9548 unsigned LoadSize =
9549 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9550
9551 OpenMPIRBuilder::AtomicInfo atomicInfo(
9552 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9553 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9554 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9555 BasicBlock *CurBB = Builder.GetInsertBlock();
9556 Instruction *CurBBTI = CurBB->getTerminator();
9557 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9558 BasicBlock *ExitBB =
9559 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9560 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9561 X->getName() + ".atomic.cont");
9562 ContBB->getTerminator()->eraseFromParent();
9563 Builder.restoreIP(AllocaIP);
9564 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9565 NewAtomicAddr->setName(X->getName() + "x.new.val");
9566 Builder.SetInsertPoint(ContBB);
9567 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9568 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9569 Value *OldExprVal = PHI;
9570 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9571 if (!CBResult)
9572 return CBResult.takeError();
9573 Value *Upd = *CBResult;
9574 Builder.CreateStore(Upd, NewAtomicAddr);
9575 AtomicOrdering Failure =
9576 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9577 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9578 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9579 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9580 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9581 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9582 OldVal->eraseFromParent();
9583 Res.first = OldExprVal;
9584 Res.second = Upd;
9585
9586 if (UnreachableInst *ExitTI =
9587 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9588 CurBBTI->eraseFromParent();
9589 Builder.SetInsertPoint(ExitBB);
9590 } else {
9591 Builder.SetInsertPoint(ExitTI);
9592 }
9593 } else {
9594 IntegerType *IntCastTy =
9595 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9596 LoadInst *OldVal =
9597 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9598 OldVal->setAtomic(AO);
9599 // CurBB
9600 // | /---\
9601 // ContBB |
9602 // | \---/
9603 // ExitBB
9604 BasicBlock *CurBB = Builder.GetInsertBlock();
9605 Instruction *CurBBTI = CurBB->getTerminator();
9606 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9607 BasicBlock *ExitBB =
9608 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9609 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9610 X->getName() + ".atomic.cont");
9611 ContBB->getTerminator()->eraseFromParent();
9612 Builder.restoreIP(AllocaIP);
9613 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9614 NewAtomicAddr->setName(X->getName() + "x.new.val");
9615 Builder.SetInsertPoint(ContBB);
9616 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9617 PHI->addIncoming(OldVal, CurBB);
9618 bool IsIntTy = XElemTy->isIntegerTy();
9619 Value *OldExprVal = PHI;
9620 if (!IsIntTy) {
9621 if (XElemTy->isFloatingPointTy()) {
9622 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9623 X->getName() + ".atomic.fltCast");
9624 } else {
9625 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9626 X->getName() + ".atomic.ptrCast");
9627 }
9628 }
9629
9630 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9631 if (!CBResult)
9632 return CBResult.takeError();
9633 Value *Upd = *CBResult;
9634 Builder.CreateStore(Upd, NewAtomicAddr);
9635 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9636 AtomicOrdering Failure =
9637 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9638 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9639 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9640 Result->setVolatile(VolatileX);
9641 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9642 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9643 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9644 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9645
9646 Res.first = OldExprVal;
9647 Res.second = Upd;
9648
9649 // Set the insertion point in the exit block.
9650 if (UnreachableInst *ExitTI =
9651 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9652 CurBBTI->eraseFromParent();
9653 Builder.SetInsertPoint(ExitBB);
9654 } else {
9655 Builder.SetInsertPoint(ExitTI);
9656 }
9657 }
9658
9659 return Res;
9660}
9661
9662OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9663 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9664 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9665 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9666 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9667 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9668 if (!updateToLocation(Loc))
9669 return Loc.IP;
9670
9671 LLVM_DEBUG({
9672 Type *XTy = X.Var->getType();
9673 assert(XTy->isPointerTy() &&
9674 "OMP Atomic expects a pointer to target memory");
9675 Type *XElemTy = X.ElemTy;
9676 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9677 XElemTy->isPointerTy()) &&
9678 "OMP atomic capture expected a scalar type");
9679 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9680 "OpenMP atomic does not support LT or GT operations");
9681 });
9682
9683 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9684 // 'x' is simply atomically rewritten with 'expr'.
9685 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9686 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9687 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9688 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9689 if (!AtomicResult)
9690 return AtomicResult.takeError();
9691 Value *CapturedVal =
9692 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9693 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9694
9695 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9696 return Builder.saveIP();
9697}
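// Example (illustrative): for the capture form `{v = x; x = x + e;}`
// IsPostfixUpdate is true and the pre-update value (first) is stored to v;
// for `{x = x + e; v = x;}` the post-update value (second) is stored.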
9698
9699OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9700 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9701 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9702 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9703 bool IsFailOnly) {
9704
9705 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9706 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9707 IsPostfixUpdate, IsFailOnly, Failure);
9708}
9709
9710OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9711 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9712 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9713 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9714 bool IsFailOnly, AtomicOrdering Failure) {
9715
9716 if (!updateToLocation(Loc))
9717 return Loc.IP;
9718
9719 assert(X.Var->getType()->isPointerTy() &&
9720 "OMP atomic expects a pointer to target memory");
9721 // compare capture
9722 if (V.Var) {
9723 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9724 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9725 }
9726
9727 bool IsInteger = E->getType()->isIntegerTy();
9728
9729 if (Op == OMPAtomicCompareOp::EQ) {
9730 AtomicCmpXchgInst *Result = nullptr;
9731 if (!IsInteger) {
9732 IntegerType *IntCastTy =
9733 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9734 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9735 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9736 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9737 AO, Failure);
9738 } else {
9739 Result =
9740 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9741 }
9742
9743 if (V.Var) {
9744 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9745 if (!IsInteger)
9746 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9747 assert(OldValue->getType() == V.ElemTy &&
9748 "OldValue and V must be of same type");
9749 if (IsPostfixUpdate) {
9750 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9751 } else {
9752 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9753 if (IsFailOnly) {
9754 // CurBB----
9755 // | |
9756 // v |
9757 // ContBB |
9758 // | |
9759 // v |
9760 // ExitBB <-
9761 //
9762 // where ContBB only contains the store of old value to 'v'.
9763 BasicBlock *CurBB = Builder.GetInsertBlock();
9764 Instruction *CurBBTI = CurBB->getTerminator();
9765 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9766 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9767 CurBBTI, X.Var->getName() + ".atomic.exit");
9768 BasicBlock *ContBB = CurBB->splitBasicBlock(
9769 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9770 ContBB->getTerminator()->eraseFromParent();
9771 CurBB->getTerminator()->eraseFromParent();
9772
9773 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9774
9775 Builder.SetInsertPoint(ContBB);
9776 Builder.CreateStore(OldValue, V.Var);
9777 Builder.CreateBr(ExitBB);
9778
9779 if (UnreachableInst *ExitTI =
9780 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9781 CurBBTI->eraseFromParent();
9782 Builder.SetInsertPoint(ExitBB);
9783 } else {
9784 Builder.SetInsertPoint(ExitTI);
9785 }
9786 } else {
9787 Value *CapturedValue =
9788 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9789 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9790 }
9791 }
9792 }
9793 // The comparison result has to be stored.
9794 if (R.Var) {
9795 assert(R.Var->getType()->isPointerTy() &&
9796 "r.var must be of pointer type");
9797 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9798
9799 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9800 Value *ResultCast = R.IsSigned
9801 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9802 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9803 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9804 }
9805 } else {
9806 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9807 "Op should be either max or min at this point");
9808 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9809
9810 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9811 // Let's take max as example.
9812 // OpenMP form:
9813 // x = x > expr ? expr : x;
9814 // LLVM form:
9815 // *ptr = *ptr > val ? *ptr : val;
9816 // We need to transform to LLVM form.
9817 // x = x <= expr ? x : expr;
9818 AtomicRMWInst::BinOp NewOp;
9819 if (IsXBinopExpr) {
9820 if (IsInteger) {
9821 if (X.IsSigned)
9822 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9823 : AtomicRMWInst::Max;
9824 else
9825 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9826 : AtomicRMWInst::UMax;
9827 } else {
9828 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9829 : AtomicRMWInst::FMax;
9830 }
9831 } else {
9832 if (IsInteger) {
9833 if (X.IsSigned)
9834 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9835 : AtomicRMWInst::Min;
9836 else
9837 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9838 : AtomicRMWInst::UMin;
9839 } else {
9840 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9841 : AtomicRMWInst::FMin;
9842 }
9843 }
9844
9845 AtomicRMWInst *OldValue =
9846 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9847 if (V.Var) {
9848 Value *CapturedValue = nullptr;
9849 if (IsPostfixUpdate) {
9850 CapturedValue = OldValue;
9851 } else {
9852 CmpInst::Predicate Pred;
9853 switch (NewOp) {
9854 case AtomicRMWInst::Max:
9855 Pred = CmpInst::ICMP_SGT;
9856 break;
9857 case AtomicRMWInst::UMax:
9858 Pred = CmpInst::ICMP_UGT;
9859 break;
9860 case AtomicRMWInst::FMax:
9861 Pred = CmpInst::FCMP_OGT;
9862 break;
9863 case AtomicRMWInst::Min:
9864 Pred = CmpInst::ICMP_SLT;
9865 break;
9866 case AtomicRMWInst::UMin:
9867 Pred = CmpInst::ICMP_ULT;
9868 break;
9869 case AtomicRMWInst::FMin:
9870 Pred = CmpInst::FCMP_OLT;
9871 break;
9872 default:
9873 llvm_unreachable("unexpected comparison op");
9874 }
9875 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9876 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9877 }
9878 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9879 }
9880 }
9881
9882 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9883
9884 return Builder.saveIP();
9885}
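// Example (illustrative): the max/min forms lower to a single atomicrmw
// (max/umax/fmax or min/umin/fmin, chosen by signedness, floating-pointness,
// and which operand is x); a requested capture is recomputed from the
// returned old value with an icmp/fcmp plus select rather than a second
// atomic access.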
9886
9887OpenMPIRBuilder::InsertPointOrErrorTy
9888OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9889 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9890 Value *NumTeamsUpper, Value *ThreadLimit,
9891 Value *IfExpr) {
9892 if (!updateToLocation(Loc))
9893 return InsertPointTy();
9894
9895 uint32_t SrcLocStrSize;
9896 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9897 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9898 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9899
9900 // Outer allocation basicblock is the entry block of the current function.
9901 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9902 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9903 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9904 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9905 }
9906
9907 // The current basic block is split into four basic blocks. After outlining,
9908 // they will be mapped as follows:
9909 // ```
9910 // def current_fn() {
9911 // current_basic_block:
9912 // br label %teams.exit
9913 // teams.exit:
9914 // ; instructions after teams
9915 // }
9916 //
9917 // def outlined_fn() {
9918 // teams.alloca:
9919 // br label %teams.body
9920 // teams.body:
9921 // ; instructions within teams body
9922 // }
9923 // ```
9924 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9925 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9926 BasicBlock *AllocaBB =
9927 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9928
9929 bool SubClausesPresent =
9930 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9931 // Push num_teams
9932 if (!Config.isTargetDevice() && SubClausesPresent) {
9933 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9934 "if lowerbound is non-null, then upperbound must also be non-null "
9935 "for bounds on num_teams");
9936
9937 if (NumTeamsUpper == nullptr)
9938 NumTeamsUpper = Builder.getInt32(0);
9939
9940 if (NumTeamsLower == nullptr)
9941 NumTeamsLower = NumTeamsUpper;
9942
9943 if (IfExpr) {
9944 assert(IfExpr->getType()->isIntegerTy() &&
9945 "argument to if clause must be an integer value");
9946
9947 // upper = ifexpr ? upper : 1
9948 if (IfExpr->getType() != Int1)
9949 IfExpr = Builder.CreateICmpNE(IfExpr,
9950 ConstantInt::get(IfExpr->getType(), 0));
9951 NumTeamsUpper = Builder.CreateSelect(
9952 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9953
9954 // lower = ifexpr ? lower : 1
9955 NumTeamsLower = Builder.CreateSelect(
9956 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9957 }
9958
9959 if (ThreadLimit == nullptr)
9960 ThreadLimit = Builder.getInt32(0);
9961
9962 Value *ThreadNum = getOrCreateThreadID(Ident);
9963 Builder.CreateCall(
9964 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9965 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9966 }
9967 // Generate the body of teams.
9968 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9969 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9970 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9971 return Err;
9972
9973 OutlineInfo OI;
9974 OI.EntryBB = AllocaBB;
9975 OI.ExitBB = ExitBB;
9976 OI.OuterAllocaBB = &OuterAllocaBB;
9977
9978 // Insert fake values for global tid and bound tid.
9979 SmallVector<Instruction *, 3> ToBeDeleted;
9980 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9981 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9982 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9983 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9984 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9985
9986 auto HostPostOutlineCB = [this, Ident,
9987 ToBeDeleted](Function &OutlinedFn) mutable {
9988 // The stale call instruction will be replaced with a new call instruction
9989 // for the runtime call taking the outlined function.
9990
9991 assert(OutlinedFn.hasOneUse() &&
9992 "there must be a single user for the outlined function");
9993 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9994 ToBeDeleted.push_back(StaleCI);
9995
9996 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9997 "Outlined function must have two or three arguments only");
9998
9999 bool HasShared = OutlinedFn.arg_size() == 3;
10000
10001 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10002 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10003 if (HasShared)
10004 OutlinedFn.getArg(2)->setName("data");
10005
10006 // Call to the runtime function for teams in the current function.
10007 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10008 "outlined function.");
10009 Builder.SetInsertPoint(StaleCI);
10010 SmallVector<Value *> Args = {
10011 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10012 if (HasShared)
10013 Args.push_back(StaleCI->getArgOperand(2));
10014 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
10015 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10016 Args);
10017
10018 for (Instruction *I : llvm::reverse(ToBeDeleted))
10019 I->eraseFromParent();
10020 };
10021
10022 if (!Config.isTargetDevice())
10023 OI.PostOutlineCB = HostPostOutlineCB;
10024
10025 addOutlineInfo(std::move(OI));
10026
10027 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10028
10029 return Builder.saveIP();
10030}
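// A rough sketch of the host-side IR the code above produces for a teams
// construct with a num_teams clause; the identifiers and constants here are
// illustrative placeholders, not the exact emitted names:
// ```
// call void @__kmpc_push_num_teams_51(ptr @ident, i32 %tid, i32 %nt.lb,
//                                     i32 %nt.ub, i32 %thread.limit)
// call void (ptr, i32, ptr, ...)
//     @__kmpc_fork_teams(ptr @ident, i32 0, ptr @outlined.teams.fn)
//
// define internal void @outlined.teams.fn(ptr %global.tid.ptr,
//                                         ptr %bound.tid.ptr) { ... }
// ```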
10031
10032OpenMPIRBuilder::InsertPointOrErrorTy
10033OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10034 InsertPointTy OuterAllocaIP,
10035 BodyGenCallbackTy BodyGenCB) {
10036 if (!updateToLocation(Loc))
10037 return InsertPointTy();
10038
10039 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10040
10041 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10042 BasicBlock *BodyBB =
10043 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10044 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10045 }
10046 BasicBlock *ExitBB =
10047 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10048 BasicBlock *BodyBB =
10049 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10050 BasicBlock *AllocaBB =
10051 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10052
10053 // Generate the body of the distribute clause.
10054 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10055 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10056 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10057 return Err;
10058
10059 // When compiling for a target device, we use different runtime functions
10060 // which require a callback.
10061 if (Config.isTargetDevice()) {
10062 OutlineInfo OI;
10063 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10064 OI.EntryBB = AllocaBB;
10065 OI.ExitBB = ExitBB;
10066
10067 addOutlineInfo(std::move(OI));
10068 }
10069 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10070
10071 return Builder.saveIP();
10072}
10073
10074 GlobalVariable *
10075 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10076 std::string VarName) {
10077 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10078 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10079 Names.size()),
10080 Names);
10081 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10082 M, MapNamesArrayInit->getType(),
10083 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10084 VarName);
10085 return MapNamesArrayGlobal;
10086}
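// For illustration, with three map names this emits a private constant global
// roughly of the following form (the symbol names are placeholders):
// ```
// @.offload_mapnames = private constant [3 x ptr]
//     [ptr @.nm0, ptr @.nm1, ptr @.nm2]
// ```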
10087
10088 // Create all simple and struct types exposed by the runtime and remember
10089 // their llvm::PointerTypes for easy access later.
10090void OpenMPIRBuilder::initializeTypes(Module &M) {
10091 LLVMContext &Ctx = M.getContext();
10092 StructType *T;
10093 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10094 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10095#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10096#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10097 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10098 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10099#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10100 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10101 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10102#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10103 T = StructType::getTypeByName(Ctx, StructName); \
10104 if (!T) \
10105 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10106 VarName = T; \
10107 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10108#include "llvm/Frontend/OpenMP/OMPKinds.def"
10109}
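// As an example, an OMP_STRUCT_TYPE(Ident, "struct.ident_t", /*Packed=*/false,
// ...) entry in OMPKinds.def expands here to roughly the following (a sketch,
// with the element types elided):
// ```
// T = StructType::getTypeByName(Ctx, "struct.ident_t");
// if (!T)
//   T = StructType::create(Ctx, {/*element types*/}, "struct.ident_t", false);
// Ident = T;
// IdentPtr = PointerType::get(Ctx, DefaultTargetAS);
// ```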
10110
10111 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10112 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10113 SmallVectorImpl<BasicBlock *> &BlockVector) {
10114 SmallVector<BasicBlock *, 32> Worklist;
10115 BlockSet.insert(EntryBB);
10116 BlockSet.insert(ExitBB);
10119 while (!Worklist.empty()) {
10120 BasicBlock *BB = Worklist.pop_back_val();
10121 BlockVector.push_back(BB);
10122 for (BasicBlock *SuccBB : successors(BB))
10123 if (BlockSet.insert(SuccBB).second)
10124 Worklist.push_back(SuccBB);
10125 }
10126}
10127
10128 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10129 uint64_t Size, int32_t Flags,
10130 GlobalValue::LinkageTypes,
10131 StringRef Name) {
10132 if (!Config.isGPU()) {
10133 llvm::offloading::emitOffloadingEntry(
10134 M, object::OffloadKind::OFK_OpenMP, ID,
10135 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10136 return;
10137 }
10138 // TODO: Add support for global variables on the device after declare target
10139 // support.
10140 Function *Fn = dyn_cast<Function>(Addr);
10141 if (!Fn)
10142 return;
10143
10144 // Add a function attribute for the kernel.
10145 Fn->addFnAttr("kernel");
10146 if (T.isAMDGCN())
10147 Fn->addFnAttr("uniform-work-group-size", "true");
10148 Fn->addFnAttr(Attribute::MustProgress);
10149}
10150
10151 // We only generate metadata for functions that contain target regions.
10152void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10153 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10154
10155 // If there are no entries, we don't need to do anything.
10156 if (OffloadInfoManager.empty())
10157 return;
10158
10159 LLVMContext &C = M.getContext();
10160 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10161 TargetRegionEntryInfo>,
10162 16>
10163 OrderedEntries(OffloadInfoManager.size());
10164
10165 // Auxiliary methods to create metadata values and strings.
10166 auto &&GetMDInt = [this](unsigned V) {
10167 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10168 };
10169
10170 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10171
10172 // Create the offloading info metadata node.
10173 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10174 auto &&TargetRegionMetadataEmitter =
10175 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10176 const TargetRegionEntryInfo &EntryInfo,
10177 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10178 // Generate metadata for target regions. Each entry of this metadata
10179 // contains:
10180 // - Entry 0 -> Kind of this type of metadata (0).
10181 // - Entry 1 -> Device ID of the file where the entry was identified.
10182 // - Entry 2 -> File ID of the file where the entry was identified.
10183 // - Entry 3 -> Mangled name of the function where the entry was
10184 // identified.
10185 // - Entry 4 -> Line in the file where the entry was identified.
10186 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10187 // - Entry 6 -> Order the entry was created.
10188 // The first element of the metadata node is the kind.
10189 Metadata *Ops[] = {
10190 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10191 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10192 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10193 GetMDInt(E.getOrder())};
10194
10195 // Save this entry in the right position of the ordered entries array.
10196 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10197
10198 // Add metadata to the named metadata node.
10199 MD->addOperand(MDNode::get(C, Ops));
10200 };
10201
10202 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10203
10204 // Create a function that emits metadata for each device global variable entry.
10205 auto &&DeviceGlobalVarMetadataEmitter =
10206 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10207 StringRef MangledName,
10208 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10209 // Generate metadata for global variables. Each entry of this metadata
10210 // contains:
10211 // - Entry 0 -> Kind of this type of metadata (1).
10212 // - Entry 1 -> Mangled name of the variable.
10213 // - Entry 2 -> Declare target kind.
10214 // - Entry 3 -> Order the entry was created.
10215 // The first element of the metadata node is the kind.
10216 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10217 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10218
10219 // Save this entry in the right position of the ordered entries array.
10220 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10221 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10222
10223 // Add metadata to the named metadata node.
10224 MD->addOperand(MDNode::get(C, Ops));
10225 };
10226
10227 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10228 DeviceGlobalVarMetadataEmitter);
10229
10230 for (const auto &E : OrderedEntries) {
10231 assert(E.first && "All ordered entries must exist!");
10232 if (const auto *CE =
10233 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10234 E.first)) {
10235 if (!CE->getID() || !CE->getAddress()) {
10236 // Do not blame the entry if the parent function is not emitted.
10237 TargetRegionEntryInfo EntryInfo = E.second;
10238 StringRef FnName = EntryInfo.ParentName;
10239 if (!M.getNamedValue(FnName))
10240 continue;
10241 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10242 continue;
10243 }
10244 createOffloadEntry(CE->getID(), CE->getAddress(),
10245 /*Size=*/0, CE->getFlags(),
10246 GlobalValue::WeakAnyLinkage);
10247 } else if (const auto *CE = dyn_cast<
10248 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10249 E.first)) {
10250 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10251 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10252 CE->getFlags());
10253 switch (Flags) {
10254 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10255 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10256 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10257 continue;
10258 if (!CE->getAddress()) {
10259 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10260 continue;
10261 }
10262 // The variable has no definition - no need to add the entry.
10263 if (CE->getVarSize() == 0)
10264 continue;
10265 break;
10266 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10267 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10268 (!Config.isTargetDevice() && CE->getAddress())) &&
10269 "Declare target link address is set.");
10270 if (Config.isTargetDevice())
10271 continue;
10272 if (!CE->getAddress()) {
10273 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10274 continue;
10275 }
10276 break;
10277 default:
10278 break;
10279 }
10280
10281 // Hidden or internal symbols on the device are not externally visible.
10282 // We should not attempt to register them by creating an offloading
10283 // entry. Indirect variables are handled separately on the device.
10284 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10285 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10286 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10287 continue;
10288
10289 // Indirect globals need to use a special name that doesn't match the name
10290 // of the associated host global.
10291 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10292 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10293 Flags, CE->getLinkage(), CE->getVarName());
10294 else
10295 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10296 Flags, CE->getLinkage());
10297
10298 } else {
10299 llvm_unreachable("Unsupported entry kind.");
10300 }
10301 }
10302
10303 // Emit requires directive globals to a special entry so the runtime can
10304 // register them when the device image is loaded.
10305 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10306 // entries should be redesigned to better suit this use-case.
10307 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10308 offloading::emitOffloadingEntry(
10309 M, object::OffloadKind::OFK_OpenMP,
10310 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10311 ".requires", /*Size=*/0,
10312 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10313 Config.getRequiresFlags());
10314}
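// The named metadata produced above looks roughly like this (all values are
// illustrative):
// ```
// !omp_offload.info = !{!0, !1}
// ; target region: kind, device ID, file ID, parent, line, count, order
// !0 = !{i32 0, i32 42, i32 17, !"foo", i32 4, i32 0, i32 0}
// ; declare-target global: kind, mangled name, flags, order
// !1 = !{i32 1, !"gvar", i32 0, i32 1}
// ```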
10315
10316void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10317 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10318 unsigned FileID, unsigned Line, unsigned Count) {
10319 raw_svector_ostream OS(Name);
10320 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10321 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10322 if (Count)
10323 OS << "_" << Count;
10324}
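// For example, assuming KernelNamePrefix is "__omp_offloading_", a region in
// function "foo" at line 4 with DeviceID 0x2b and FileID 0x1c yields the name
// "__omp_offloading_2b_1c_foo_l4"; a nonzero Count appends "_<count>".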
10325
10326void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10327 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10328 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10329 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10330 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10331 EntryInfo.Line, NewCount);
10332}
10333
10334TargetRegionEntryInfo
10335OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10336 vfs::FileSystem &VFS,
10337 StringRef ParentName) {
10338 sys::fs::UniqueID ID(0xdeadf17e, 0);
10339 auto FileIDInfo = CallBack();
10340 uint64_t FileID = 0;
10341 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10342 ID = Status->getUniqueID();
10343 FileID = Status->getUniqueID().getFile();
10344 } else {
10345 // If the inode ID could not be determined, create a hash value of
10346 // the current file name and use that as an ID.
10347 FileID = hash_value(std::get<0>(FileIDInfo));
10348 }
10349
10350 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10351 std::get<1>(FileIDInfo));
10352}
10353
10354unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10355 unsigned Offset = 0;
10356 for (uint64_t Remain =
10357 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10358 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10359 !(Remain & 1); Remain = Remain >> 1)
10360 Offset++;
10361 return Offset;
10362}
10363
10364 omp::OpenMPOffloadMappingFlags
10365 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10366 // Rotate by getFlagMemberOffset() bits.
10367 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10368 << getFlagMemberOffset());
10369}
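// A worked example, assuming OMP_MAP_MEMBER_OF occupies the top 16 bits of
// the 64-bit flag word (mask 0xFFFF000000000000): getFlagMemberOffset()
// counts 48 trailing zero bits, so getMemberOfFlag(0) evaluates to
// (0 + 1) << 48, i.e. member index 1 stored in the MEMBER_OF field.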
10370
10371 void OpenMPIRBuilder::setCorrectMemberOfFlag(
10372 omp::OpenMPOffloadMappingFlags &Flags,
10373 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10374 // If the entry is PTR_AND_OBJ but has not been marked with the special
10375 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10376 // marked as MEMBER_OF.
10377 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10378 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10379 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10380 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10381 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10382 return;
10383
10384 // Reset the placeholder value to prepare the flag for the assignment of the
10385 // proper MEMBER_OF value.
10386 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10387 Flags |= MemberOfFlag;
10388}
10389
10390Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10391 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10392 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10393 bool IsDeclaration, bool IsExternallyVisible,
10394 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10395 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10396 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10397 std::function<Constant *()> GlobalInitializer,
10398 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10399 // TODO: convert this to utilise the IRBuilder Config rather than
10400 // a passed down argument.
10401 if (OpenMPSIMD)
10402 return nullptr;
10403
10404 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10405 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10406 CaptureClause ==
10407 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10408 Config.hasRequiresUnifiedSharedMemory())) {
10409 SmallString<64> PtrName;
10410 {
10411 raw_svector_ostream OS(PtrName);
10412 OS << MangledName;
10413 if (!IsExternallyVisible)
10414 OS << format("_%x", EntryInfo.FileID);
10415 OS << "_decl_tgt_ref_ptr";
10416 }
10417
10418 Value *Ptr = M.getNamedValue(PtrName);
10419
10420 if (!Ptr) {
10421 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10422 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10423
10424 auto *GV = cast<GlobalVariable>(Ptr);
10425 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10426
10427 if (!Config.isTargetDevice()) {
10428 if (GlobalInitializer)
10429 GV->setInitializer(GlobalInitializer());
10430 else
10431 GV->setInitializer(GlobalValue);
10432 }
10433
10434 registerTargetGlobalVariable(
10435 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10436 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10437 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10438 }
10439
10440 return cast<Constant>(Ptr);
10441 }
10442
10443 return nullptr;
10444}
10445
10446void OpenMPIRBuilder::registerTargetGlobalVariable(
10447 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10448 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10449 bool IsDeclaration, bool IsExternallyVisible,
10450 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10451 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10452 std::vector<Triple> TargetTriple,
10453 std::function<Constant *()> GlobalInitializer,
10454 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10455 Constant *Addr) {
10456 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10457 (TargetTriple.empty() && !Config.isTargetDevice()))
10458 return;
10459
10460 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10461 StringRef VarName;
10462 int64_t VarSize;
10463 GlobalValue::LinkageTypes Linkage;
10464
10465 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10466 CaptureClause ==
10467 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10468 !Config.hasRequiresUnifiedSharedMemory()) {
10469 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10470 VarName = MangledName;
10471 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10472
10473 if (!IsDeclaration)
10474 VarSize = divideCeil(
10475 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10476 else
10477 VarSize = 0;
10478 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10479
10480 // This is a workaround carried over from Clang which prevents undesired
10481 // optimisation of internal variables.
10482 if (Config.isTargetDevice() &&
10483 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10484 // Do not create a "ref-variable" if the original is not also available
10485 // on the host.
10486 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10487 return;
10488
10489 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10490
10491 if (!M.getNamedValue(RefName)) {
10492 Constant *AddrRef =
10493 getOrCreateInternalVariable(Addr->getType(), RefName);
10494 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10495 GvAddrRef->setConstant(true);
10496 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10497 GvAddrRef->setInitializer(Addr);
10498 GeneratedRefs.push_back(GvAddrRef);
10499 }
10500 }
10501 } else {
10502 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10503 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10504 else
10505 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10506
10507 if (Config.isTargetDevice()) {
10508 VarName = (Addr) ? Addr->getName() : "";
10509 Addr = nullptr;
10510 } else {
10511 Addr = getAddrOfDeclareTargetVar(
10512 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10513 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10514 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10515 VarName = (Addr) ? Addr->getName() : "";
10516 }
10517 VarSize = M.getDataLayout().getPointerSize();
10518 Linkage = GlobalValue::WeakAnyLinkage;
10519 }
10520
10521 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10522 Flags, Linkage);
10523}
10524
10525/// Loads all the offload entries information from the host IR
10526/// metadata.
10527void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10528 // If we are in target mode, load the metadata from the host IR. This code has
10529 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10530
10531 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10532 if (!MD)
10533 return;
10534
10535 for (MDNode *MN : MD->operands()) {
10536 auto &&GetMDInt = [MN](unsigned Idx) {
10537 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10538 return cast<ConstantInt>(V->getValue())->getZExtValue();
10539 };
10540
10541 auto &&GetMDString = [MN](unsigned Idx) {
10542 auto *V = cast<MDString>(MN->getOperand(Idx));
10543 return V->getString();
10544 };
10545
10546 switch (GetMDInt(0)) {
10547 default:
10548 llvm_unreachable("Unexpected metadata!");
10549 break;
10550 case OffloadEntriesInfoManager::OffloadEntryInfo::
10551 OffloadingEntryInfoTargetRegion: {
10552 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10553 /*DeviceID=*/GetMDInt(1),
10554 /*FileID=*/GetMDInt(2),
10555 /*Line=*/GetMDInt(4),
10556 /*Count=*/GetMDInt(5));
10557 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10558 /*Order=*/GetMDInt(6));
10559 break;
10560 }
10561 case OffloadEntriesInfoManager::OffloadEntryInfo::
10562 OffloadingEntryInfoDeviceGlobalVar:
10563 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10564 /*MangledName=*/GetMDString(1),
10565 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10566 /*Flags=*/GetMDInt(2)),
10567 /*Order=*/GetMDInt(3));
10568 break;
10569 }
10570 }
10571}
10572
10573void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10574 StringRef HostFilePath) {
10575 if (HostFilePath.empty())
10576 return;
10577
10578 auto Buf = VFS.getBufferForFile(HostFilePath);
10579 if (std::error_code Err = Buf.getError()) {
10580 report_fatal_error(("error opening host file from host file path inside of "
10581 "OpenMPIRBuilder: " +
10582 Err.message())
10583 .c_str());
10584 }
10585
10586 LLVMContext Ctx;
10587 auto M = expectedToErrorOrAndEmitErrors(
10588 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10589 if (std::error_code Err = M.getError()) {
10590 report_fatal_error(
10591 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10592 .c_str());
10593 }
10594
10595 loadOffloadInfoMetadata(*M.get());
10596}
10597
10598//===----------------------------------------------------------------------===//
10599// OffloadEntriesInfoManager
10600//===----------------------------------------------------------------------===//
10601
10602bool OffloadEntriesInfoManager::empty() const {
10603 return OffloadEntriesTargetRegion.empty() &&
10604 OffloadEntriesDeviceGlobalVar.empty();
10605}
10606
10607unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10608 const TargetRegionEntryInfo &EntryInfo) const {
10609 auto It = OffloadEntriesTargetRegionCount.find(
10610 getTargetRegionEntryCountKey(EntryInfo));
10611 if (It == OffloadEntriesTargetRegionCount.end())
10612 return 0;
10613 return It->second;
10614}
10615
10616void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10617 const TargetRegionEntryInfo &EntryInfo) {
10618 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10619 EntryInfo.Count + 1;
10620}
10621
10622/// Initialize target region entry.
10623void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10624 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10625 OffloadEntriesTargetRegion[EntryInfo] =
10626 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10627 OMPTargetRegionEntryTargetRegion);
10628 ++OffloadingEntriesNum;
10629}
10630
10631void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10632 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10633 OMPTargetRegionEntryKind Flags) {
10634 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10635
10636 // Update the EntryInfo with the next available count for this location.
10637 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10638
10639 // If we are emitting code for a target, the entry is already initialized;
10640 // it only has to be registered.
10641 if (OMPBuilder->Config.isTargetDevice()) {
10642 // This could happen if the device compilation is invoked standalone.
10643 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10644 return;
10645 }
10646 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10647 Entry.setAddress(Addr);
10648 Entry.setID(ID);
10649 Entry.setFlags(Flags);
10650 } else {
10651 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10652 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10653 return;
10654 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10655 "Target region entry already registered!");
10656 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10657 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10658 ++OffloadingEntriesNum;
10659 }
10660 incrementTargetRegionEntryInfoCount(EntryInfo);
10661}
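// For example, two target regions at the same DeviceID/FileID/Line are
// registered with Count 0 and Count 1: the first registration reads the
// per-location count (0), and incrementTargetRegionEntryInfoCount() then
// bumps it to 1 for the next region at that location.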
10662
10663bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10664 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10665
10666 // Update the EntryInfo with the next available count for this location.
10667 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10668
10669 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10670 if (It == OffloadEntriesTargetRegion.end()) {
10671 return false;
10672 }
10673 // Fail if this entry is already registered.
10674 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10675 return false;
10676 return true;
10677}
10678
10679void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10680 const OffloadTargetRegionEntryInfoActTy &Action) {
10681 // Scan all target region entries and perform the provided action.
10682 for (const auto &It : OffloadEntriesTargetRegion) {
10683 Action(It.first, It.second);
10684 }
10685}
10686
10687void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10688 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10689 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10690 ++OffloadingEntriesNum;
10691}
10692
10693void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10694 StringRef VarName, Constant *Addr, int64_t VarSize,
10695 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10696 if (OMPBuilder->Config.isTargetDevice()) {
10697 // This could happen if the device compilation is invoked standalone.
10698 if (!hasDeviceGlobalVarEntryInfo(VarName))
10699 return;
10700 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10701 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10702 if (Entry.getVarSize() == 0) {
10703 Entry.setVarSize(VarSize);
10704 Entry.setLinkage(Linkage);
10705 }
10706 return;
10707 }
10708 Entry.setVarSize(VarSize);
10709 Entry.setLinkage(Linkage);
10710 Entry.setAddress(Addr);
10711 } else {
10712 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10713 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10714 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10715 "Entry not initialized!");
10716 if (Entry.getVarSize() == 0) {
10717 Entry.setVarSize(VarSize);
10718 Entry.setLinkage(Linkage);
10719 }
10720 return;
10721 }
10722 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10723 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10724 Addr, VarSize, Flags, Linkage,
10725 VarName.str());
10726 else
10727 OffloadEntriesDeviceGlobalVar.try_emplace(
10728 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10729 ++OffloadingEntriesNum;
10730 }
10731}
10732
10733void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10734 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10735 // Scan all device global variable entries and perform the provided action.
10736 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10737 Action(E.getKey(), E.getValue());
10738}
10739
10740//===----------------------------------------------------------------------===//
10741// CanonicalLoopInfo
10742//===----------------------------------------------------------------------===//
10743
10744 void CanonicalLoopInfo::collectControlBlocks(
10745 SmallVectorImpl<BasicBlock *> &BBs) {
10746 // We only count those BBs as control block for which we do not need to
10747 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10748 // flow. For consistency, this also means we do not add the Body block, which
10749 // is just the entry to the body code.
10750 BBs.reserve(BBs.size() + 6);
10751 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10752}
10753
10754BasicBlock *CanonicalLoopInfo::getPreheader() const {
10755 assert(isValid() && "Requires a valid canonical loop");
10756 for (BasicBlock *Pred : predecessors(Header)) {
10757 if (Pred != Latch)
10758 return Pred;
10759 }
10760 llvm_unreachable("Missing preheader");
10761}
10762
10763void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10764 assert(isValid() && "Requires a valid canonical loop");
10765
10766 Instruction *CmpI = &getCond()->front();
10767 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10768 CmpI->setOperand(1, TripCount);
10769
10770#ifndef NDEBUG
10771 assertOK();
10772#endif
10773}
10774
10775void CanonicalLoopInfo::mapIndVar(
10776 llvm::function_ref<Value *(Instruction *)> Updater) {
10777 assert(isValid() && "Requires a valid canonical loop");
10778
10779 Instruction *OldIV = getIndVar();
10780
10781 // Record all uses excluding those introduced by the updater. Uses by the
10782 // CanonicalLoopInfo itself to keep track of the number of iterations are
10783 // excluded.
10784 SmallVector<Use *> ReplacableUses;
10785 for (Use &U : OldIV->uses()) {
10786 auto *User = dyn_cast<Instruction>(U.getUser());
10787 if (!User)
10788 continue;
10789 if (User->getParent() == getCond())
10790 continue;
10791 if (User->getParent() == getLatch())
10792 continue;
10793 ReplacableUses.push_back(&U);
10794 }
10795
10796 // Run the updater that may introduce new uses
10797 Value *NewIV = Updater(OldIV);
10798
10799 // Replace the old uses with the value returned by the updater.
10800 for (Use *U : ReplacableUses)
10801 U->set(NewIV);
10802
10803#ifndef NDEBUG
10804 assertOK();
10805#endif
10806}
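// A typical use (a sketch; Start and Step are assumed loop parameters, not
// names from this file): remap the canonical 0..N counter to a strided
// user-visible IV while leaving the uses in the condition and latch intact:
// ```
// CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstInsertionPt());
//   return Builder.CreateAdd(Builder.CreateMul(OldIV, Step), Start);
// });
// ```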
10807
10808void CanonicalLoopInfo::assertOK() const {
10809#ifndef NDEBUG
10810 // No constraints if this object currently does not describe a loop.
10811 if (!isValid())
10812 return;
10813
10814 BasicBlock *Preheader = getPreheader();
10815 BasicBlock *Body = getBody();
10816 BasicBlock *After = getAfter();
10817
10818 // Verify standard control-flow we use for OpenMP loops.
10819 assert(Preheader);
10820 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10821 "Preheader must terminate with unconditional branch");
10822 assert(Preheader->getSingleSuccessor() == Header &&
10823 "Preheader must jump to header");
10824
10825 assert(Header);
10826 assert(isa<BranchInst>(Header->getTerminator()) &&
10827 "Header must terminate with unconditional branch");
10828 assert(Header->getSingleSuccessor() == Cond &&
10829 "Header must jump to exiting block");
10830
10831 assert(Cond);
10832 assert(Cond->getSinglePredecessor() == Header &&
10833 "Exiting block only reachable from header");
10834
10835 assert(isa<BranchInst>(Cond->getTerminator()) &&
10836 "Exiting block must terminate with conditional branch");
10837 assert(size(successors(Cond)) == 2 &&
10838 "Exiting block must have two successors");
10839 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10840 "Exiting block's first successor jump to the body");
10841 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10842 "Exiting block's second successor must exit the loop");
10843
10844 assert(Body);
10845 assert(Body->getSinglePredecessor() == Cond &&
10846 "Body only reachable from exiting block");
10847 assert(!isa<PHINode>(Body->front()));
10848
10849 assert(Latch);
10850 assert(isa<BranchInst>(Latch->getTerminator()) &&
10851 "Latch must terminate with unconditional branch");
10852 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10853 // TODO: To support simple redirecting of the end of the body code that has
10854 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
10855 assert(Latch->getSinglePredecessor() != nullptr);
10856 assert(!isa<PHINode>(Latch->front()));
10857
10858 assert(Exit);
10859 assert(isa<BranchInst>(Exit->getTerminator()) &&
10860 "Exit block must terminate with unconditional branch");
10861 assert(Exit->getSingleSuccessor() == After &&
10862 "Exit block must jump to after block");
10863
10864 assert(After);
10865 assert(After->getSinglePredecessor() == Exit &&
10866 "After block only reachable from exit block");
10867 assert(After->empty() || !isa<PHINode>(After->front()));
10868
10869 Instruction *IndVar = getIndVar();
10870 assert(IndVar && "Canonical induction variable not found?");
10871 assert(isa<IntegerType>(IndVar->getType()) &&
10872 "Induction variable must be an integer");
10873 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10874 "Induction variable must be a PHI in the loop header");
10875 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10876 assert(
10877 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10878 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10879
10880 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10881 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10882 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10883 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10884 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10885 ->isOne());
10886
10887 Value *TripCount = getTripCount();
10888 assert(TripCount && "Loop trip count not found?");
10889 assert(IndVar->getType() == TripCount->getType() &&
10890 "Trip count and induction variable must have the same type");
10891
10892 auto *CmpI = cast<CmpInst>(&Cond->front());
10893 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10894 "Exit condition must be a signed less-than comparison");
10895 assert(CmpI->getOperand(0) == IndVar &&
10896 "Exit condition must compare the induction variable");
10897 assert(CmpI->getOperand(1) == TripCount &&
10898 "Exit condition must compare with the trip count");
10899#endif
10900}
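// The shape verified above, as a sketch:
// ```
// preheader -> header -> cond --> body -> ... -> latch -> header
//                          \--> exit -> after
// ```
// where the header holds %iv = phi [0, %preheader], [%iv.next, %latch] and
// cond branches on icmp ult %iv, %tripcount.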
10901
10902void CanonicalLoopInfo::invalidate() {
10903 Header = nullptr;
10904 Cond = nullptr;
10905 Latch = nullptr;
10906 Exit = nullptr;
10907}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
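Taken together, these factory methods cover the common constant patterns; a minimal sketch, assuming Ty is a first-class Type* and Ctx an LLVMContext:
Constant *Zero = Constant::getNullValue(Ty);    // 0 / null / +0.0
Constant *Ones = Constant::getAllOnesValue(Ty); // all-ones bit pattern
ConstantInt *True = ConstantInt::getTrue(Ctx);  // the i1 constant 1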
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non-instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
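The usual Expected<T> idiom, sketched with a hypothetical createThing():
Expected<Function *> FnOrErr = createThing();
if (!FnOrErr)
  return FnOrErr.takeError(); // propagate the failure to the caller
Function *Fn = *FnOrErr;      // safe: the error case was handled above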
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
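FunctionType::get and Function::Create combine in the obvious way; a sketch declaring a hypothetical "void foo(i32)", with Ctx an LLVMContext and M a Module*:
FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                      {Type::getInt32Ty(Ctx)},
                                      /*isVarArg=*/false);
Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "foo", M);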
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
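A sketch tying these together, assuming Ctx, a Constant *C, and an Instruction *I; the kind name "my.note" and the string "note" are made up for illustration:
MDNode *MD = MDTuple::get(Ctx, {MDString::get(Ctx, "note"),
                                ConstantAsMetadata::get(C)});
I->setMetadata(Ctx.getMDKindID("my.note"), MD);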
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:229
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
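The insert() return value makes SmallPtrSet convenient as a visited set; a sketch with Worklist as a hypothetical container:
SmallPtrSet<BasicBlock *, 8> Visited;
if (Visited.insert(BB).second) // true only on the first insertion
  Worklist.push_back(BB);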
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
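A sketch, assuming Builder is an IRBuilder<> and V/Ptr are existing values; the 4-byte alignment is an arbitrary example:
StoreInst *St = Builder.CreateStore(V, Ptr);
St->setAlignment(Align(4));
St->setAtomic(AtomicOrdering::Monotonic);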
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
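A sketch of building a small switch, assuming Cond is an i32 Value*, Int32Ty the corresponding IntegerType*, and the destination blocks already exist; passing BB as the insert position appends to that block:
SwitchInst *SI = SwitchInst::Create(Cond, DefaultBB, /*NumCases=*/2, BB);
SI->addCase(ConstantInt::get(Int32Ty, 0), Case0BB);
SI->addCase(ConstantInt::get(Int32Ty, 1), Case1BB);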
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1050
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1112
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:413
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1128
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
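A minimal sketch, assuming OldV and NewV have the same type:
OldV->replaceAllUsesWith(NewV);
assert(OldV->use_empty() && "every use now refers to NewV");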
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldRep...
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID used if the device was not defined; the runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:345
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
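The classic use is erasing while iterating; a sketch with a hypothetical shouldErase() predicate:
for (Instruction &I : make_early_inc_range(*BB))
  if (shouldErase(I))
    I.eraseFromParent(); // safe: the iterator was already advanced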
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their containing function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...