//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  case OMP_SCHEDULE_Distribute:
    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
                                 : OMPScheduleType::BaseDistribute;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
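    // For example, a plain schedule(dynamic) therefore receives the
    // nonmonotonic modifier below, whereas schedule(static) or any schedule
    // combined with an ordered clause stays monotonic, which the runtime
    // already treats as its default, so no flag needs to be set.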
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
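/// For example, schedule(dynamic, 4) without ordered or monotonicity
/// modifiers maps to BaseDynamicChunked, then to UnorderedDynamicChunked,
/// and finally gains ModifierNonmonotonic, the OpenMP 5.x default for
/// dynamic schedules.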
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here; the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
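// For example, a call like createFakeIntVal(Builder, OuterAllocaIP,
// ToBeDeleted, InnerAllocaIP, "gid") creates an i32 alloca %gid.addr at the
// outer alloca point plus a fake use (a load %gid.use) inside the region;
// both are recorded in ToBeDeleted so they can be stripped after outlining.
// (The names here are only illustrative.)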
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
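  // Layout of the flags word built above: bit 0 carries HasNoWait and the
  // dynamic cgroup memory fallback kind occupies the bits from bit 2 upwards.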

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
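        // The printed IR then carries the annotation in a form along the
        // lines of:
        //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
        //   !0 = !{!1}
        //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}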
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);
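    // At this point the module contains a declaration such as (e.g. for the
    // __kmpc_global_thread_num runtime function):
    //   declare i32 @__kmpc_global_thread_num(ptr)
    // with the attribute sets from OMPKinds.def applied to it.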

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Expected<BasicBlock *>
OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) {
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    IRBuilderBase::InsertPointGuard Guard(Builder);
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder,
                                                     BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
                                                     ArrayRef<Value *> Args,
                                                     StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target or risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }
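
  // The resulting ident ends up in the module as something like:
  //   @0 = private unnamed_addr constant %struct.ident_t
  //            { i32 0, i32 2, i32 0, i32 <SrcLocStrSize>, ptr @.str }
  // where the value 2 in the second field is the KMPC flag enabled above.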

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
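  // For FunctionName "foo", FileName "bar.c", Line 12 and Column 3 the
  // buffer now holds ";bar.c;foo;12;3;;".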
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
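  // Emits, e.g.:
  //   %omp_global_thread_num = call i32 @__kmpc_global_thread_num(ptr %ident)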
  return createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);
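  // For the cancellable variant this emits, e.g.:
  //   %res = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
  // where a nonzero %res signals that the region has been cancelled; the
  // plain @__kmpc_barrier form returns void.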

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store the captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};
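  // Together these arguments produce a call along the lines of:
  //   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 1, i32 -1,
  //                                 i32 -1, ptr @outlined..omp_par, ptr null,
  //                                 ptr %args, i64 <n>)
  // (function and value names are illustrative).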

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

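  // The assembled argument list yields a call shaped like, e.g. for two
  // captured pointers:
  //   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @loc, i32 2,
  //                                                    ptr @outlined, ptr %a,
  //                                                    ptr %b)
  // (names are illustrative).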
1510 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1511
1512 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1513 << *Builder.GetInsertBlock()->getParent() << "\n");
1514
1515 // Initialize the local TID stack location with the argument value.
1516 Builder.SetInsertPoint(PrivTID);
1517 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1518 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1519 PrivTIDAddr);
1520
1521 // Remove redundant call to the outlined function.
1522 CI->eraseFromParent();
1523
1524 for (Instruction *I : ToBeDeleted) {
1525 I->eraseFromParent();
1526 }
1527}
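// Illustrative sketch of the rewrite performed above, assuming a single
// captured variable %a (the IR names here are invented, not emitted
// verbatim):
// \code{c}
// omp_parallel:
//   ; was: call void @outer_fn..omp_par(ptr %tid.addr, ptr %zero.addr, ptr %a)
//   call void (ptr, i32, ptr, ...)
//         @__kmpc_fork_call(ptr @ident, i32 1, ptr @outer_fn..omp_par, ptr %a)
// \endcode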
1528
1529OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1530 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1531 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1532 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1533 omp::ProcBindKind ProcBind, bool IsCancellable) {
1534 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1535
1536 if (!updateToLocation(Loc))
1537 return Loc.IP;
1538
1539 uint32_t SrcLocStrSize;
1540 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1541 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1542 Value *ThreadID = getOrCreateThreadID(Ident);
1543 // If we generate code for the target device, we need to allocate
1544 // struct for aggregate params in the device default alloca address space.
1545 // OpenMP runtime requires that the params of the extracted functions are
1546 // passed as zero address space pointers. This flag ensures that extracted
1547 // function arguments are declared in zero address space
1548 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1549
1550 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1551 // only if we compile for host side.
1552 if (NumThreads && !Config.isTargetDevice()) {
1553 Value *Args[] = {
1554 Ident, ThreadID,
1555 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1556 createRuntimeFunctionCall(
1557 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1558 }
1559
1560 if (ProcBind != OMP_PROC_BIND_default) {
1561 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1562 Value *Args[] = {
1563 Ident, ThreadID,
1564 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1565 createRuntimeFunctionCall(
1566 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1567 }
1568
1569 BasicBlock *InsertBB = Builder.GetInsertBlock();
1570 Function *OuterFn = InsertBB->getParent();
1571
1572 // Save the outer alloca block because the insertion iterator may get
1573 // invalidated and we still need this later.
1574 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1575
1576 // Vector to remember instructions we used only during the modeling but which
1577 // we want to delete at the end.
1578 SmallVector<Instruction *, 4> ToBeDeleted;
1579
1580 // Change the location to the outer alloca insertion point to create and
1581 // initialize the allocas we pass into the parallel region.
1582 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1583 Builder.restoreIP(NewOuter);
1584 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1585 AllocaInst *ZeroAddrAlloca =
1586 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1587 Instruction *TIDAddr = TIDAddrAlloca;
1588 Instruction *ZeroAddr = ZeroAddrAlloca;
1589 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1590 // Add additional casts to enforce pointers in zero address space
1591 TIDAddr = new AddrSpaceCastInst(
1592 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1593 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1594 ToBeDeleted.push_back(TIDAddr);
1595 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1596 PointerType::get(M.getContext(), 0),
1597 "zero.addr.ascast");
1598 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1599 ToBeDeleted.push_back(ZeroAddr);
1600 }
1601
1602 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1603 // associated arguments in the outlined function, so we delete them later.
1604 ToBeDeleted.push_back(TIDAddrAlloca);
1605 ToBeDeleted.push_back(ZeroAddrAlloca);
1606
1607 // Create an artificial insertion point that will also ensure the blocks we
1608 // are about to split are not degenerated.
1609 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1610
1611 BasicBlock *EntryBB = UI->getParent();
1612 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1613 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1614 BasicBlock *PRegPreFiniBB =
1615 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1616 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1617
1618 auto FiniCBWrapper = [&](InsertPointTy IP) {
1619 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1620 // target to the region exit block.
1621 if (IP.getBlock()->end() == IP.getPoint()) {
1622 IRBuilder<>::InsertPointGuard IPG(Builder);
1623 Builder.restoreIP(IP);
1624 Instruction *I = Builder.CreateBr(PRegExitBB);
1625 IP = InsertPointTy(I->getParent(), I->getIterator());
1626 }
1627 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1628 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1629 "Unexpected insertion point for finalization call!");
1630 return FiniCB(IP);
1631 };
1632
1633 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1634
1635 // Generate the privatization allocas in the block that will become the entry
1636 // of the outlined function.
1637 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1638 InsertPointTy InnerAllocaIP = Builder.saveIP();
1639
1640 AllocaInst *PrivTIDAddr =
1641 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1642 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1643
1644 // Add some fake uses for OpenMP provided arguments.
1645 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1646 Instruction *ZeroAddrUse =
1647 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1648 ToBeDeleted.push_back(ZeroAddrUse);
1649
1650 // EntryBB
1651 // |
1652 // V
1653 // PRegionEntryBB <- Privatization allocas are placed here.
1654 // |
1655 // V
1656 // PRegionBodyBB <- BodyGen is invoked here.
1657 // |
1658 // V
1659 // PRegPreFiniBB <- The block we will start finalization from.
1660 // |
1661 // V
1662 // PRegionExitBB <- A common exit to simplify block collection.
1663 //
1664
1665 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1666
1667 // Let the caller create the body.
1668 assert(BodyGenCB && "Expected body generation callback!");
1669 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1670 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1671 return Err;
1672
1673 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1674
1675 OutlineInfo OI;
1676 if (Config.isTargetDevice()) {
1677 // Generate OpenMP target specific runtime call
1678 OI.PostOutlineCB = [=, ToBeDeletedVec =
1679 std::move(ToBeDeleted)](Function &OutlinedFn) {
1680 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1681 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1682 ThreadID, ToBeDeletedVec);
1683 };
1684 } else {
1685 // Generate OpenMP host runtime call
1686 OI.PostOutlineCB = [=, ToBeDeletedVec =
1687 std::move(ToBeDeleted)](Function &OutlinedFn) {
1688 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1689 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1690 };
1691 }
1692
1693 OI.OuterAllocaBB = OuterAllocaBlock;
1694 OI.EntryBB = PRegEntryBB;
1695 OI.ExitBB = PRegExitBB;
1696
1697 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1698 SmallVector<BasicBlock *, 32> Blocks;
1699 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1700
1701 CodeExtractorAnalysisCache CEAC(*OuterFn);
1702 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1703 /* AggregateArgs */ false,
1704 /* BlockFrequencyInfo */ nullptr,
1705 /* BranchProbabilityInfo */ nullptr,
1706 /* AssumptionCache */ nullptr,
1707 /* AllowVarArgs */ true,
1708 /* AllowAlloca */ true,
1709 /* AllocationBlock */ OuterAllocaBlock,
1710 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1711
1712 // Find inputs to, outputs from the code region.
1713 BasicBlock *CommonExit = nullptr;
1714 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1715 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1716
1717 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1718 /*CollectGlobalInputs=*/true);
1719
1720 Inputs.remove_if([&](Value *I) {
1721 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1722 return GV->getValueType() == OpenMPIRBuilder::Ident;
1723
1724 return false;
1725 });
1726
1727 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1728
1729 FunctionCallee TIDRTLFn =
1730 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1731
1732 auto PrivHelper = [&](Value &V) -> Error {
1733 if (&V == TIDAddr || &V == ZeroAddr) {
1734 OI.ExcludeArgsFromAggregate.push_back(&V);
1735 return Error::success();
1736 }
1737
1738 SetVector<Use *> Uses;
1739 for (Use &U : V.uses())
1740 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1741 if (ParallelRegionBlockSet.count(UserI->getParent()))
1742 Uses.insert(&U);
1743
1744 // __kmpc_fork_call expects extra arguments as pointers. If the input
1745 // already has a pointer type, everything is fine. Otherwise, store the
1746 // value onto stack and load it back inside the to-be-outlined region. This
1747 // will ensure only the pointer will be passed to the function.
1748 // FIXME: if there are more than 15 trailing arguments, they must be
1749 // additionally packed in a struct.
1750 Value *Inner = &V;
1751 if (!V.getType()->isPointerTy()) {
1752 IRBuilder<>::InsertPointGuard Guard(Builder);
1753 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1754
1755 Builder.restoreIP(OuterAllocaIP);
1756 Value *Ptr =
1757 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1758
1759 // Store to stack at end of the block that currently branches to the entry
1760 // block of the to-be-outlined region.
1761 Builder.SetInsertPoint(InsertBB,
1762 InsertBB->getTerminator()->getIterator());
1763 Builder.CreateStore(&V, Ptr);
1764
1765 // Load back next to allocations in the to-be-outlined region.
1766 Builder.restoreIP(InnerAllocaIP);
1767 Inner = Builder.CreateLoad(V.getType(), Ptr);
1768 }
1769
1770 Value *ReplacementValue = nullptr;
1771 CallInst *CI = dyn_cast<CallInst>(&V);
1772 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1773 ReplacementValue = PrivTID;
1774 } else {
1775 InsertPointOrErrorTy AfterIP =
1776 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1777 if (!AfterIP)
1778 return AfterIP.takeError();
1779 Builder.restoreIP(*AfterIP);
1780 InnerAllocaIP = {
1781 InnerAllocaIP.getBlock(),
1782 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1783
1784 assert(ReplacementValue &&
1785 "Expected copy/create callback to set replacement value!");
1786 if (ReplacementValue == &V)
1787 return Error::success();
1788 }
1789
1790 for (Use *UPtr : Uses)
1791 UPtr->set(ReplacementValue);
1792
1793 return Error::success();
1794 };
1795
1796 // Reset the inner alloca insertion as it will be used for loading the values
1797 // wrapped into pointers before passing them into the to-be-outlined region.
1798 // Configure it to insert immediately after the fake use of zero address so
1799 // that they are available in the generated body and so that the
1800 // OpenMP-related values (thread ID and zero address pointers) remain leading
1801 // in the argument list.
1802 InnerAllocaIP = IRBuilder<>::InsertPoint(
1803 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1804
1805 // Reset the outer alloca insertion point to the entry of the relevant block
1806 // in case it was invalidated.
1807 OuterAllocaIP = IRBuilder<>::InsertPoint(
1808 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1809
1810 for (Value *Input : Inputs) {
1811 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1812 if (Error Err = PrivHelper(*Input))
1813 return Err;
1814 }
1815 LLVM_DEBUG({
1816 for (Value *Output : Outputs)
1817 dbgs() << "Captured output: " << *Output << "\n";
1818 });
1819 assert(Outputs.empty() &&
1820 "OpenMP outlining should not produce live-out values!");
1821
1822 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1823 LLVM_DEBUG({
1824 for (auto *BB : Blocks)
1825 dbgs() << " PBR: " << BB->getName() << "\n";
1826 });
1827
1828 // Adjust the finalization stack, verify the adjustment, and call the
1829 // finalize function a last time to finalize values between the pre-fini
1830 // block and the exit block if we left the parallel "the normal way".
1831 auto FiniInfo = FinalizationStack.pop_back_val();
1832 (void)FiniInfo;
1833 assert(FiniInfo.DK == OMPD_parallel &&
1834 "Unexpected finalization stack state!");
1835
1836 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1837
1838 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1839 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1840 if (!FiniBBOrErr)
1841 return FiniBBOrErr.takeError();
1842 {
1843 IRBuilderBase::InsertPointGuard Guard(Builder);
1844 Builder.restoreIP(PreFiniIP);
1845 Builder.CreateBr(*FiniBBOrErr);
1846 // There's currently a branch to omp.par.exit. Delete it. We will get there
1847 // via the fini block
1848 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1849 Term->eraseFromParent();
1850 }
1851
1852 // Register the outlined info.
1853 addOutlineInfo(std::move(OI));
1854
1855 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1856 UI->eraseFromParent();
1857
1858 return AfterIP;
1859}
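// A minimal sketch of how a frontend might drive createParallel, assuming a
// LocationDescription `Loc` and an alloca insertion point `AllocaIP` are
// already in scope (the callback bodies are hypothetical placeholders):
// \code{c}
// OpenMPIRBuilder OMPBuilder(M);
// OMPBuilder.initialize();
// auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//   // Emit the parallel region body at CodeGenIP.
//   return Error::success();
// };
// auto PrivCB = [&](OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
//                   OpenMPIRBuilder::InsertPointTy CodeGenIP, Value &Orig,
//                   Value &Inner, Value *&ReplVal)
//     -> OpenMPIRBuilder::InsertPointOrErrorTy {
//   ReplVal = &Inner; // Treat every captured value as shared.
//   return CodeGenIP;
// };
// auto FiniCB = [&](OpenMPIRBuilder::InsertPointTy IP) -> Error {
//   return Error::success();
// };
// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
//     Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
//     /*NumThreads=*/nullptr, OMP_PROC_BIND_default, /*IsCancellable=*/false);
// \endcode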
1860
1861void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1862 // Build call void __kmpc_flush(ident_t *loc)
1863 uint32_t SrcLocStrSize;
1864 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1865 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1866
1867 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
1868 Args);
1869}
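// In sketch form, the flush above lowers to a single runtime call:
// \code{c}
// call void @__kmpc_flush(ptr @ident)
// \endcode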
1870
1871void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1872 if (!updateToLocation(Loc))
1873 return;
1874 emitFlush(Loc);
1875}
1876
1877void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1878 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1879 // global_tid);
1880 uint32_t SrcLocStrSize;
1881 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1882 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1883 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1884
1885 // Ignore return result until untied tasks are supported.
1886 createRuntimeFunctionCall(
1887 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1888}
1889
1890void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1891 if (!updateToLocation(Loc))
1892 return;
1893 emitTaskwaitImpl(Loc);
1894}
1895
1896void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1897 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1898 uint32_t SrcLocStrSize;
1899 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1900 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1901 Constant *I32Null = ConstantInt::getNullValue(Int32);
1902 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1903
1904 createRuntimeFunctionCall(
1905 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1906}
1907
1908void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1909 if (!updateToLocation(Loc))
1910 return;
1911 emitTaskyieldImpl(Loc);
1912}
1913
1914// Processes the dependencies in Dependencies and does the following
1915// - Allocates space on the stack of an array of DependInfo objects
1916// - Populates each DependInfo object with relevant information of
1917// the corresponding dependence.
1918// - All code is inserted in the entry block of the current function.
1919 static Value *emitTaskDependencies(
1920 OpenMPIRBuilder &OMPBuilder,
1921 const SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
1922 // Early return if we have no dependencies to process
1923 if (Dependencies.empty())
1924 return nullptr;
1925
1926 // Given a vector of DependData objects, in this function we create an
1927 // array on the stack that holds kmp_dep_info objects corresponding
1928 // to each dependency. This is then passed to the OpenMP runtime.
1929 // For example, if there are 'n' dependencies then the following pseudo
1930 // code is generated. Assume the first dependence is on a variable 'a'
1931 //
1932 // \code{c}
1933 // DepArray = alloc(n x sizeof(kmp_depend_info));
1934 // idx = 0;
1935 // DepArray[idx].base_addr = ptrtoint(&a);
1936 // DepArray[idx].len = 8;
1937 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1938 // ++idx;
1939 // DepArray[idx].base_addr = ...;
1940 // \endcode
1941
1942 IRBuilderBase &Builder = OMPBuilder.Builder;
1943 Type *DependInfo = OMPBuilder.DependInfo;
1944 Module &M = OMPBuilder.M;
1945
1946 Value *DepArray = nullptr;
1947 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1948 Builder.SetInsertPoint(
1949 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1950
1951 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1952 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1953
1954 Builder.restoreIP(OldIP);
1955
1956 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1957 Value *Base =
1958 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1959 // Store the pointer to the variable
1960 Value *Addr = Builder.CreateStructGEP(
1961 DependInfo, Base,
1962 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1963 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1964 Builder.CreateStore(DepValPtr, Addr);
1965 // Store the size of the variable
1966 Value *Size = Builder.CreateStructGEP(
1967 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1968 Builder.CreateStore(
1969 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1970 Size);
1971 // Store the dependency kind
1972 Value *Flags = Builder.CreateStructGEP(
1973 DependInfo, Base,
1974 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1975 Builder.CreateStore(
1976 ConstantInt::get(Builder.getInt8Ty(),
1977 static_cast<unsigned int>(Dep.DepKind)),
1978 Flags);
1979 }
1980 return DepArray;
1981}
1982
1983OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1984 const LocationDescription &Loc, InsertPointTy AllocaIP,
1985 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1986 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1987 Value *Priority) {
1988
1989 if (!updateToLocation(Loc))
1990 return InsertPointTy();
1991
1992 uint32_t SrcLocStrSize;
1993 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1994 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1995 // The current basic block is split into four basic blocks. After outlining,
1996 // they will be mapped as follows:
1997 // ```
1998 // def current_fn() {
1999 // current_basic_block:
2000 // br label %task.exit
2001 // task.exit:
2002 // ; instructions after task
2003 // }
2004 // def outlined_fn() {
2005 // task.alloca:
2006 // br label %task.body
2007 // task.body:
2008 // ret void
2009 // }
2010 // ```
2011 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2012 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2013 BasicBlock *TaskAllocaBB =
2014 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2015
2016 InsertPointTy TaskAllocaIP =
2017 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2018 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2019 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2020 return Err;
2021
2022 OutlineInfo OI;
2023 OI.EntryBB = TaskAllocaBB;
2024 OI.OuterAllocaBB = AllocaIP.getBlock();
2025 OI.ExitBB = TaskExitBB;
2026
2027 // Add the thread ID argument.
2028 SmallVector<Instruction *, 4> ToBeDeleted;
2029 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2030 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2031
2032 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2033 Mergeable, Priority, EventHandle, TaskAllocaBB,
2034 ToBeDeleted](Function &OutlinedFn) mutable {
2035 // Replace the stale CI with the appropriate RTL function call.
2036 assert(OutlinedFn.hasOneUse() &&
2037 "there must be a single user for the outlined function");
2038 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2039
2040 // HasShareds is true if any variables are captured in the outlined region,
2041 // false otherwise.
2042 bool HasShareds = StaleCI->arg_size() > 1;
2043 Builder.SetInsertPoint(StaleCI);
2044
2045 // Gather the arguments for emitting the runtime call for
2046 // @__kmpc_omp_task_alloc
2047 Function *TaskAllocFn =
2048 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2049
2050 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
2051 // call.
2052 Value *ThreadID = getOrCreateThreadID(Ident);
2053
2054 // Argument - `flags`
2055 // Task is tied iff (Flags & 1) == 1.
2056 // Task is untied iff (Flags & 1) == 0.
2057 // Task is final iff (Flags & 2) == 2.
2058 // Task is not final iff (Flags & 2) == 0.
2059 // Task is mergeable iff (Flags & 4) == 4.
2060 // Task is not mergeable iff (Flags & 4) == 0.
2061 // Task is priority iff (Flags & 32) == 32.
2062 // Task is not priority iff (Flags & 32) == 0.
2063 // TODO: Handle the other flags.
2064 Value *Flags = Builder.getInt32(Tied);
2065 if (Final) {
2066 Value *FinalFlag =
2067 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2068 Flags = Builder.CreateOr(FinalFlag, Flags);
2069 }
2070
2071 if (Mergeable)
2072 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2073 if (Priority)
2074 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2075
2076 // Argument - `sizeof_kmp_task_t` (TaskSize)
2077 // Tasksize refers to the size in bytes of kmp_task_t data structure
2078 // including private vars accessed in task.
2079 // TODO: add kmp_task_t_with_privates (privates)
2080 Value *TaskSize = Builder.getInt64(
2081 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2082
2083 // Argument - `sizeof_shareds` (SharedsSize)
2084 // SharedsSize refers to the shareds array size in the kmp_task_t data
2085 // structure.
2086 Value *SharedsSize = Builder.getInt64(0);
2087 if (HasShareds) {
2088 AllocaInst *ArgStructAlloca =
2089 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2090 assert(ArgStructAlloca &&
2091 "Unable to find the alloca instruction corresponding to arguments "
2092 "for extracted function");
2093 StructType *ArgStructType =
2094 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2095 assert(ArgStructType && "Unable to find struct type corresponding to "
2096 "arguments for extracted function");
2097 SharedsSize =
2098 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2099 }
2100 // Emit the @__kmpc_omp_task_alloc runtime call
2101 // The runtime call returns a pointer to an area where the task captured
2102 // variables must be copied before the task is run (TaskData)
2103 CallInst *TaskData = createRuntimeFunctionCall(
2104 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2105 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2106 /*task_func=*/&OutlinedFn});
2107
2108 // Emit detach clause initialization.
2109 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2110 // task_descriptor);
2111 if (EventHandle) {
2112 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2113 OMPRTL___kmpc_task_allow_completion_event);
2114 llvm::Value *EventVal =
2115 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2116 llvm::Value *EventHandleAddr =
2117 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2118 Builder.getPtrTy(0));
2119 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2120 Builder.CreateStore(EventVal, EventHandleAddr);
2121 }
2122 // Copy the arguments for outlined function
2123 if (HasShareds) {
2124 Value *Shareds = StaleCI->getArgOperand(1);
2125 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2126 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2127 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2128 SharedsSize);
2129 }
2130
2131 if (Priority) {
2132 //
2133 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2134 // we populate the priority information into the "kmp_task_t" here
2135 //
2136 // The struct "kmp_task_t" definition is available in kmp.h
2137 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2138 // data2 is used for priority
2139 //
2140 Type *Int32Ty = Builder.getInt32Ty();
2141 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2142 // kmp_task_t* => { ptr }
2143 Type *TaskPtr = StructType::get(VoidPtr);
2144 Value *TaskGEP =
2145 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2146 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2147 Type *TaskStructType = StructType::get(
2148 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2149 Value *PriorityData = Builder.CreateInBoundsGEP(
2150 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2151 // kmp_cmplrdata_t => { ptr, ptr }
2152 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2153 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2154 PriorityData, {Zero, Zero});
2155 Builder.CreateStore(Priority, CmplrData);
2156 }
2157
2158 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2159
2160 // In the presence of the `if` clause, the following IR is generated:
2161 // ...
2162 // %data = call @__kmpc_omp_task_alloc(...)
2163 // br i1 %if_condition, label %then, label %else
2164 // then:
2165 // call @__kmpc_omp_task(...)
2166 // br label %exit
2167 // else:
2168 // ;; Wait for resolution of dependencies, if any, before
2169 // ;; beginning the task
2170 // call @__kmpc_omp_wait_deps(...)
2171 // call @__kmpc_omp_task_begin_if0(...)
2172 // call @outlined_fn(...)
2173 // call @__kmpc_omp_task_complete_if0(...)
2174 // br label %exit
2175 // exit:
2176 // ...
2177 if (IfCondition) {
2178 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2179 // terminator.
2180 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2181 Instruction *IfTerminator =
2182 Builder.GetInsertPoint()->getParent()->getTerminator();
2183 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2184 Builder.SetInsertPoint(IfTerminator);
2185 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2186 &ElseTI);
2187 Builder.SetInsertPoint(ElseTI);
2188
2189 if (Dependencies.size()) {
2190 Function *TaskWaitFn =
2191 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2192 createRuntimeFunctionCall(
2193 TaskWaitFn,
2194 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2195 ConstantInt::get(Builder.getInt32Ty(), 0),
2196 ConstantPointerNull::get(Builder.getPtrTy())});
2197 }
2198 Function *TaskBeginFn =
2199 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2200 Function *TaskCompleteFn =
2201 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2202 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2203 CallInst *CI = nullptr;
2204 if (HasShareds)
2205 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2206 else
2207 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2208 CI->setDebugLoc(StaleCI->getDebugLoc());
2209 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2210 Builder.SetInsertPoint(ThenTI);
2211 }
2212
2213 if (Dependencies.size()) {
2214 Function *TaskFn =
2215 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2216 createRuntimeFunctionCall(
2217 TaskFn,
2218 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2219 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2220 ConstantPointerNull::get(Builder.getPtrTy())});
2221
2222 } else {
2223 // Emit the @__kmpc_omp_task runtime call to spawn the task
2224 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2225 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2226 }
2227
2228 StaleCI->eraseFromParent();
2229
2230 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2231 if (HasShareds) {
2232 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2233 OutlinedFn.getArg(1)->replaceUsesWithIf(
2234 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2235 }
2236
2237 for (Instruction *I : llvm::reverse(ToBeDeleted))
2238 I->eraseFromParent();
2239 };
2240
2241 addOutlineInfo(std::move(OI));
2242 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2243
2244 return Builder.saveIP();
2245}
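// Illustrative shape of the IR emitted for a plain tied task with no
// dependencies and no `if` clause (the sizes and names below are invented
// for the sketch):
// \code{c}
// %task = call ptr @__kmpc_omp_task_alloc(ptr @ident, i32 %gtid,
//             i32 1 /*tied*/, i64 40 /*sizeof_kmp_task_t*/,
//             i64 8 /*sizeof_shareds*/, ptr @outlined_fn)
// ; memcpy the captured variables into the shareds area of %task
// call i32 @__kmpc_omp_task(ptr @ident, i32 %gtid, ptr %task)
// \endcode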
2246
2247OpenMPIRBuilder::InsertPointOrErrorTy
2248OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2249 InsertPointTy AllocaIP,
2250 BodyGenCallbackTy BodyGenCB) {
2251 if (!updateToLocation(Loc))
2252 return InsertPointTy();
2253
2254 uint32_t SrcLocStrSize;
2255 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2256 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2257 Value *ThreadID = getOrCreateThreadID(Ident);
2258
2259 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2260 Function *TaskgroupFn =
2261 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2262 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2263
2264 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2265 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2266 return Err;
2267
2268 Builder.SetInsertPoint(TaskgroupExitBB);
2269 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2270 Function *EndTaskgroupFn =
2271 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2272 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2273
2274 return Builder.saveIP();
2275}
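// The emitted frame around the taskgroup body is, in sketch form:
// \code{c}
// call void @__kmpc_taskgroup(ptr @ident, i32 %gtid)
// ; ... body; child tasks spawned here are awaited below ...
// call void @__kmpc_end_taskgroup(ptr @ident, i32 %gtid)
// \endcode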
2276
2277OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2278 const LocationDescription &Loc, InsertPointTy AllocaIP,
2279 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2280 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2281 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2282
2283 if (!updateToLocation(Loc))
2284 return Loc.IP;
2285
2286 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2287
2288 // Each section is emitted as a switch case
2289 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2290 // -> OMP.createSection() which generates the IR for each section
2291 // Iterate through all sections and emit a switch construct:
2292 // switch (IV) {
2293 // case 0:
2294 // <SectionStmt[0]>;
2295 // break;
2296 // ...
2297 // case <NumSection> - 1:
2298 // <SectionStmt[<NumSection> - 1]>;
2299 // break;
2300 // }
2301 // ...
2302 // section_loop.after:
2303 // <FiniCB>;
2304 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2305 Builder.restoreIP(CodeGenIP);
2306 BasicBlock *Continue =
2307 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2308 Function *CurFn = Continue->getParent();
2309 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2310
2311 unsigned CaseNumber = 0;
2312 for (auto SectionCB : SectionCBs) {
2313 BasicBlock *CaseBB = BasicBlock::Create(
2314 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2315 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2316 Builder.SetInsertPoint(CaseBB);
2317 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2318 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2319 CaseEndBr->getIterator()}))
2320 return Err;
2321 CaseNumber++;
2322 }
2323 // remove the existing terminator from body BB since there can be no
2324 // terminators after switch/case
2325 return Error::success();
2326 };
2327 // Loop body ends here
2328 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2329 Type *I32Ty = Type::getInt32Ty(M.getContext());
2330 Value *LB = ConstantInt::get(I32Ty, 0);
2331 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2332 Value *ST = ConstantInt::get(I32Ty, 1);
2333 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2334 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2335 if (!LoopInfo)
2336 return LoopInfo.takeError();
2337
2338 InsertPointOrErrorTy WsloopIP =
2339 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2340 WorksharingLoopType::ForStaticLoop, !IsNowait);
2341 if (!WsloopIP)
2342 return WsloopIP.takeError();
2343 InsertPointTy AfterIP = *WsloopIP;
2344
2345 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2346 assert(LoopFini && "Bad structure of static workshare loop finalization");
2347
2348 // Apply the finalization callback in LoopAfterBB
2349 auto FiniInfo = FinalizationStack.pop_back_val();
2350 assert(FiniInfo.DK == OMPD_sections &&
2351 "Unexpected finalization stack state!");
2352 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2353 return Err;
2354
2355 return AfterIP;
2356}
2357
2358OpenMPIRBuilder::InsertPointOrErrorTy
2359OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2360 BodyGenCallbackTy BodyGenCB,
2361 FinalizeCallbackTy FiniCB) {
2362 if (!updateToLocation(Loc))
2363 return Loc.IP;
2364
2365 auto FiniCBWrapper = [&](InsertPointTy IP) {
2366 if (IP.getBlock()->end() != IP.getPoint())
2367 return FiniCB(IP);
2368 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2369 // will fail because that function requires the Finalization Basic Block to
2370 // have a terminator, which is already removed by EmitOMPRegionBody.
2371 // IP is currently at the cancellation block.
2372 // We need to backtrack to the condition block to fetch
2373 // the exit block and create a branch from the cancellation
2374 // block to the exit block.
2375 IRBuilder<>::InsertPointGuard IPG(Builder);
2376 Builder.restoreIP(IP);
2377 auto *CaseBB = Loc.IP.getBlock();
2378 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2379 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2380 Instruction *I = Builder.CreateBr(ExitBB);
2381 IP = InsertPointTy(I->getParent(), I->getIterator());
2382 return FiniCB(IP);
2383 };
2384
2385 Directive OMPD = Directive::OMPD_sections;
2386 // Since we are using Finalization Callback here, HasFinalize
2387 // and IsCancellable have to be true
2388 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2389 /*Conditional*/ false, /*hasFinalize*/ true,
2390 /*IsCancellable*/ true);
2391}
2392
2393static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2394 BasicBlock::iterator IT(I);
2395 IT++;
2396 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2397}
2398
2399Value *OpenMPIRBuilder::getGPUThreadID() {
2400 return createRuntimeFunctionCall(
2401 getOrCreateRuntimeFunction(M,
2402 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2403 {});
2404}
2405
2406Value *OpenMPIRBuilder::getGPUWarpSize() {
2407 return createRuntimeFunctionCall(
2408 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2409}
2410
2411Value *OpenMPIRBuilder::getNVPTXWarpID() {
2412 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2413 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2414}
2415
2416Value *OpenMPIRBuilder::getNVPTXLaneID() {
2417 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2418 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2419 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2420 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2421 "nvptx_lane_id");
2422}
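// Worked example: with the common warp size of 32, LaneIDBits is 5,
// LaneIDMask is ~0u >> 27 == 0x1f, and so the lane id is `tid & 31` while
// the warp id above is `tid >> 5`.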
2423
2424Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2425 Type *ToType) {
2426 Type *FromType = From->getType();
2427 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2428 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2429 assert(FromSize > 0 && "From size must be greater than zero");
2430 assert(ToSize > 0 && "To size must be greater than zero");
2431 if (FromType == ToType)
2432 return From;
2433 if (FromSize == ToSize)
2434 return Builder.CreateBitCast(From, ToType);
2435 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2436 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2437 InsertPointTy SaveIP = Builder.saveIP();
2438 Builder.restoreIP(AllocaIP);
2439 Value *CastItem = Builder.CreateAlloca(ToType);
2440 Builder.restoreIP(SaveIP);
2441
2442 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2443 CastItem, Builder.getPtrTy(0));
2444 Builder.CreateStore(From, ValCastItem);
2445 return Builder.CreateLoad(ToType, CastItem);
2446}
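// For example (a sketch of the three paths above): an i16-to-i64 cast takes
// the integer path and emits a sext; a float-to-i32 cast is size-preserving
// and becomes a single bitcast; any remaining mismatch, e.g. a two-byte
// struct to i32, falls back to a ToType alloca at AllocaIP plus a
// store/load round trip.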
2447
2448Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2449 Value *Element,
2450 Type *ElementType,
2451 Value *Offset) {
2452 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2453 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2454
2455 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2456 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2457 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2458 Value *WarpSize =
2459 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2460 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2461 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2462 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2463 Value *WarpSizeCast =
2464 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2465 Value *ShuffleCall =
2466 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2467 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2468}
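// A sketch of what this emits for a 4-byte element (value names invented):
// \code{c}
// %warp.size = call i32 @__kmpc_get_warp_size()
// %warp.size16 = trunc i32 %warp.size to i16
// %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset,
//                                       i16 %warp.size16)
// \endcode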
2469
2470void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2471 Value *DstAddr, Type *ElemType,
2472 Value *Offset, Type *ReductionArrayTy,
2473 bool IsByRefElem) {
2474 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2475 // Create the loop over the big sized data.
2476 // ptr = (void*)Elem;
2477 // ptrEnd = (void*) Elem + 1;
2478 // Step = 8;
2479 // while (ptr + Step < ptrEnd)
2480 // shuffle((int64_t)*ptr);
2481 // Step = 4;
2482 // while (ptr + Step < ptrEnd)
2483 // shuffle((int32_t)*ptr);
2484 // ...
2485 Type *IndexTy = Builder.getIndexTy(
2486 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2487 Value *ElemPtr = DstAddr;
2488 Value *Ptr = SrcAddr;
2489 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2490 if (Size < IntSize)
2491 continue;
2492 Type *IntType = Builder.getIntNTy(IntSize * 8);
2493 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2494 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2495 Value *SrcAddrGEP =
2496 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2497 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2498 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2499
2500 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2501 if ((Size / IntSize) > 1) {
2502 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2503 SrcAddrGEP, Builder.getPtrTy());
2504 BasicBlock *PreCondBB =
2505 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2506 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2507 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2508 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2509 emitBlock(PreCondBB, CurFunc);
2510 PHINode *PhiSrc =
2511 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2512 PhiSrc->addIncoming(Ptr, CurrentBB);
2513 PHINode *PhiDest =
2514 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2515 PhiDest->addIncoming(ElemPtr, CurrentBB);
2516 Ptr = PhiSrc;
2517 ElemPtr = PhiDest;
2518 Value *PtrDiff = Builder.CreatePtrDiff(
2519 Builder.getInt8Ty(), PtrEnd,
2520 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2521 Builder.CreateCondBr(
2522 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2523 ExitBB);
2524 emitBlock(ThenBB, CurFunc);
2525 Value *Res = createRuntimeShuffleFunction(
2526 AllocaIP,
2527 Builder.CreateAlignedLoad(
2528 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2529 IntType, Offset);
2530 Builder.CreateAlignedStore(Res, ElemPtr,
2531 M.getDataLayout().getPrefTypeAlign(ElemType));
2532 Value *LocalPtr =
2533 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2534 Value *LocalElemPtr =
2535 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2536 PhiSrc->addIncoming(LocalPtr, ThenBB);
2537 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2538 emitBranch(PreCondBB);
2539 emitBlock(ExitBB, CurFunc);
2540 } else {
2541 Value *Res = createRuntimeShuffleFunction(
2542 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2543 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2544 Res->getType()->getScalarSizeInBits())
2545 Res = Builder.CreateTrunc(Res, ElemType);
2546 Builder.CreateStore(Res, ElemPtr);
2547 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2548 ElemPtr =
2549 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2550 }
2551 Size = Size % IntSize;
2552 }
2553}
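// Worked example for the decomposition above: a 12-byte element performs a
// single 8-byte shuffle (Size becomes 12 % 8 == 4) followed by a single
// 4-byte shuffle (Size becomes 0); a 16-byte element instead hits the
// (Size / IntSize) > 1 case and emits the pre-condition loop around the
// 8-byte shuffle.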
2554
2555Error OpenMPIRBuilder::emitReductionListCopy(
2556 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2557 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2558 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
2559 Type *IndexTy = Builder.getIndexTy(
2560 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2561 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2562
2563 // Iterates, element-by-element, through the source Reduce list and
2564 // makes a copy.
2565 for (auto En : enumerate(ReductionInfos)) {
2566 const ReductionInfo &RI = En.value();
2567 Value *SrcElementAddr = nullptr;
2568 AllocaInst *DestAlloca = nullptr;
2569 Value *DestElementAddr = nullptr;
2570 Value *DestElementPtrAddr = nullptr;
2571 // Should we shuffle in an element from a remote lane?
2572 bool ShuffleInElement = false;
2573 // Set to true to update the pointer in the dest Reduce list to a
2574 // newly created element.
2575 bool UpdateDestListPtr = false;
2576
2577 // Step 1.1: Get the address for the src element in the Reduce list.
2578 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2579 ReductionArrayTy, SrcBase,
2580 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2581 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2582
2583 // Step 1.2: Create a temporary to store the element in the destination
2584 // Reduce list.
2585 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2586 ReductionArrayTy, DestBase,
2587 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2588 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
2589 switch (Action) {
2590 case CopyAction::RemoteLaneToThread: {
2591 InsertPointTy CurIP = Builder.saveIP();
2592 Builder.restoreIP(AllocaIP);
2593
2594 Type *DestAllocaType =
2595 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
2596 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
2597 ".omp.reduction.element");
2598 DestAlloca->setAlignment(
2599 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
2600 DestElementAddr = DestAlloca;
2601 DestElementAddr =
2602 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2603 DestElementAddr->getName() + ".ascast");
2604 Builder.restoreIP(CurIP);
2605 ShuffleInElement = true;
2606 UpdateDestListPtr = true;
2607 break;
2608 }
2609 case CopyAction::ThreadCopy: {
2610 DestElementAddr =
2611 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2612 break;
2613 }
2614 }
2615
2616 // Now that all active lanes have read the element in the
2617 // Reduce list, shuffle over the value from the remote lane.
2618 if (ShuffleInElement) {
2619 Type *ShuffleType = RI.ElementType;
2620 Value *ShuffleSrcAddr = SrcElementAddr;
2621 Value *ShuffleDestAddr = DestElementAddr;
2622 AllocaInst *LocalStorage = nullptr;
2623
2624 if (IsByRefElem) {
2625 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
2626 assert(RI.ByRefAllocatedType &&
2627 "Expected by-ref allocated type to be set");
2628 // For by-ref reductions, we need to copy from the remote lane the
2629 // actual value of the partial reduction computed by that remote lane;
2630 // rather than, for example, a pointer to that data or, even worse, a
2631 // pointer to the descriptor of the by-ref reduction element.
2632 ShuffleType = RI.ByRefElementType;
2633
2634 InsertPointOrErrorTy GenResult =
2635 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
2636
2637 if (!GenResult)
2638 return GenResult.takeError();
2639
2640 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
2641
2642 {
2643 InsertPointTy OldIP = Builder.saveIP();
2644 Builder.restoreIP(AllocaIP);
2645
2646 LocalStorage = Builder.CreateAlloca(ShuffleType);
2647 Builder.restoreIP(OldIP);
2648 ShuffleDestAddr = LocalStorage;
2649 }
2650 }
2651
2652 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
2653 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
2654
2655 if (IsByRefElem) {
2656 Value *GEP;
2657 InsertPointOrErrorTy GenResult =
2658 RI.DataPtrPtrGen(Builder.saveIP(),
2659 Builder.CreatePointerBitCastOrAddrSpaceCast(
2660 DestAlloca, Builder.getPtrTy(), ".ascast"),
2661 GEP);
2662
2663 if (!GenResult)
2664 return GenResult.takeError();
2665
2666 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
2667 LocalStorage, Builder.getPtrTy(), ".ascast"),
2668 GEP);
2669 }
2670 } else {
2671 switch (RI.EvaluationKind) {
2672 case EvalKind::Scalar: {
2673 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2674 // Store the source element value to the dest element address.
2675 Builder.CreateStore(Elem, DestElementAddr);
2676 break;
2677 }
2678 case EvalKind::Complex: {
2679 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2680 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2681 Value *SrcReal = Builder.CreateLoad(
2682 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2683 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2684 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2685 Value *SrcImg = Builder.CreateLoad(
2686 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2687
2688 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2689 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2690 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2691 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2692 Builder.CreateStore(SrcReal, DestRealPtr);
2693 Builder.CreateStore(SrcImg, DestImgPtr);
2694 break;
2695 }
2696 case EvalKind::Aggregate: {
2697 Value *SizeVal = Builder.getInt64(
2698 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2699 Builder.CreateMemCpy(
2700 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2701 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2702 SizeVal, false);
2703 break;
2704 }
2705 };
2706 }
2707
2708 // Step 3.1: Modify reference in dest Reduce list as needed.
2709 // Modifying the reference in Reduce list to point to the newly
2710 // created element. The element is live in the current function
2711 // scope and that of functions it invokes (i.e., reduce_function).
2712 // RemoteReduceData[i] = (void*)&RemoteElem
2713 if (UpdateDestListPtr) {
2714 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2715 DestElementAddr, Builder.getPtrTy(),
2716 DestElementAddr->getName() + ".ascast");
2717 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2718 }
2719 }
2720
2721 return Error::success();
2722}
2723
2724Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2725 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2726 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2727 InsertPointTy SavedIP = Builder.saveIP();
2728 LLVMContext &Ctx = M.getContext();
2729 FunctionType *FuncTy = FunctionType::get(
2730 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2731 /* IsVarArg */ false);
2732 Function *WcFunc =
2733 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2734 "_omp_reduction_inter_warp_copy_func", &M);
2735 WcFunc->setAttributes(FuncAttrs);
2736 WcFunc->addParamAttr(0, Attribute::NoUndef);
2737 WcFunc->addParamAttr(1, Attribute::NoUndef);
2738 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2739 Builder.SetInsertPoint(EntryBB);
2740
2741 // ReduceList: thread local Reduce list.
2742 // At the stage of the computation when this function is called, partially
2743 // aggregated values reside in the first lane of every active warp.
2744 Argument *ReduceListArg = WcFunc->getArg(0);
2745 // NumWarps: number of warps active in the parallel region. This could
2746 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2747 Argument *NumWarpsArg = WcFunc->getArg(1);
2748
2749 // This array is used as a medium to transfer, one reduce element at a time,
2750 // the data from the first lane of every warp to lanes in the first warp
2751 // in order to perform the final step of a reduction in a parallel region
2752 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2753 // for reduced latency, as well as to have a distinct copy for concurrently
2754 // executing target regions. The array is declared with common linkage so
2755 // as to be shared across compilation units.
2756 StringRef TransferMediumName =
2757 "__openmp_nvptx_data_transfer_temporary_storage";
2758 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2759 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2760 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2761 if (!TransferMedium) {
2762 TransferMedium = new GlobalVariable(
2763 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2764 UndefValue::get(ArrayTy), TransferMediumName,
2765 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2766 /*AddressSpace=*/3);
2767 }
2768
2769 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2770 Value *GPUThreadID = getGPUThreadID();
2771 // nvptx_lane_id = nvptx_id % warpsize
2772 Value *LaneID = getNVPTXLaneID();
2773 // nvptx_warp_id = nvptx_id / warpsize
2774 Value *WarpID = getNVPTXWarpID();
2775
2776 InsertPointTy AllocaIP =
2777 InsertPointTy(Builder.GetInsertBlock(),
2778 Builder.GetInsertBlock()->getFirstInsertionPt());
2779 Type *Arg0Type = ReduceListArg->getType();
2780 Type *Arg1Type = NumWarpsArg->getType();
2781 Builder.restoreIP(AllocaIP);
2782 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2783 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2784 AllocaInst *NumWarpsAlloca =
2785 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2786 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2787 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2788 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2789 NumWarpsAlloca, Builder.getPtrTy(0),
2790 NumWarpsAlloca->getName() + ".ascast");
2791 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2792 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2793 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2794 InsertPointTy CodeGenIP =
2795 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2796 Builder.restoreIP(CodeGenIP);
2797
2798 Value *ReduceList =
2799 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2800
2801 for (auto En : enumerate(ReductionInfos)) {
2802 //
2803 // Warp master copies reduce element to transfer medium in __shared__
2804 // memory.
2805 //
2806 const ReductionInfo &RI = En.value();
2807 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
2808 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
2809 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
2810 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2811 Type *CType = Builder.getIntNTy(TySize * 8);
2812
2813 unsigned NumIters = RealTySize / TySize;
2814 if (NumIters == 0)
2815 continue;
2816 Value *Cnt = nullptr;
2817 Value *CntAddr = nullptr;
2818 BasicBlock *PrecondBB = nullptr;
2819 BasicBlock *ExitBB = nullptr;
2820 if (NumIters > 1) {
2821 CodeGenIP = Builder.saveIP();
2822 Builder.restoreIP(AllocaIP);
2823 CntAddr =
2824 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2825
2826 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2827 CntAddr->getName() + ".ascast");
2828 Builder.restoreIP(CodeGenIP);
2829 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2830 CntAddr,
2831 /*Volatile=*/false);
2832 PrecondBB = BasicBlock::Create(Ctx, "precond");
2833 ExitBB = BasicBlock::Create(Ctx, "exit");
2834 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2835 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2836 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2837 /*Volatile=*/false);
2838 Value *Cmp = Builder.CreateICmpULT(
2839 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2840 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2841 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2842 }
2843
2844 // kmpc_barrier.
2845 InsertPointOrErrorTy BarrierIP1 =
2846 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2847 omp::Directive::OMPD_unknown,
2848 /* ForceSimpleCall */ false,
2849 /* CheckCancelFlag */ true);
2850 if (!BarrierIP1)
2851 return BarrierIP1.takeError();
2852 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2853 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2854 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2855
2856 // if (lane_id == 0)
2857 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2858 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2859 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2860
2861 // Reduce element = LocalReduceList[i]
2862 auto *RedListArrayTy =
2863 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2864 Type *IndexTy = Builder.getIndexTy(
2865 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2866 Value *ElemPtrPtr =
2867 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2868 {ConstantInt::get(IndexTy, 0),
2869 ConstantInt::get(IndexTy, En.index())});
2870 // elemptr = ((CopyType*)(elemptrptr)) + I
2871 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2872
2873 if (IsByRefElem) {
2874 InsertPointOrErrorTy GenRes =
2875 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
2876
2877 if (!GenRes)
2878 return GenRes.takeError();
2879
2880 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
2881 }
2882
2883 if (NumIters > 1)
2884 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2885
2886 // Get pointer to location in transfer medium.
2887 // MediumPtr = &medium[warp_id]
2888 Value *MediumPtr = Builder.CreateInBoundsGEP(
2889 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2890 // elem = *elemptr
2891 //*MediumPtr = elem
2892 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2893 // Store the source element value to the dest element address.
2894 Builder.CreateStore(Elem, MediumPtr,
2895 /*IsVolatile*/ true);
2896 Builder.CreateBr(MergeBB);
2897
2898 // else
2899 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2900 Builder.CreateBr(MergeBB);
2901
2902 // endif
2903 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2904 InsertPointOrErrorTy BarrierIP2 =
2905 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2906 omp::Directive::OMPD_unknown,
2907 /* ForceSimpleCall */ false,
2908 /* CheckCancelFlag */ true);
2909 if (!BarrierIP2)
2910 return BarrierIP2.takeError();
2911
2912 // Warp 0 copies reduce element from transfer medium
2913 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2914 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2915 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2916
2917 Value *NumWarpsVal =
2918 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2919 // Up to 32 threads in warp 0 are active.
2920 Value *IsActiveThread =
2921 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2922 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2923
2924 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2925
2926 // SrcMediumPtr = &medium[tid]
2927 // SrcMediumVal = *SrcMediumPtr
2928 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2929 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2930 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2931 Value *TargetElemPtrPtr =
2932 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2933 {ConstantInt::get(IndexTy, 0),
2934 ConstantInt::get(IndexTy, En.index())});
2935 Value *TargetElemPtrVal =
2936 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2937 Value *TargetElemPtr = TargetElemPtrVal;
2938
2939 if (IsByRefElem) {
2940 InsertPointOrErrorTy GenRes =
2941 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
2942
2943 if (!GenRes)
2944 return GenRes.takeError();
2945
2946 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
2947 }
2948
2949 if (NumIters > 1)
2950 TargetElemPtr =
2951 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2952
2953 // *TargetElemPtr = SrcMediumVal;
2954 Value *SrcMediumValue =
2955 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2956 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2957 Builder.CreateBr(W0MergeBB);
2958
2959 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2960 Builder.CreateBr(W0MergeBB);
2961
2962 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2963
2964 if (NumIters > 1) {
2965 Cnt = Builder.CreateNSWAdd(
2966 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2967 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2968
2969 auto *CurFn = Builder.GetInsertBlock()->getParent();
2970 emitBranch(PrecondBB);
2971 emitBlock(ExitBB, CurFn);
2972 }
2973 RealTySize %= TySize;
2974 }
2975 }
2976
2977 Builder.CreateRetVoid();
2978 Builder.restoreIP(SavedIP);
2979
2980 return WcFunc;
2981}
2982
2983Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
2984 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2985 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2986 LLVMContext &Ctx = M.getContext();
2987 FunctionType *FuncTy =
2988 FunctionType::get(Builder.getVoidTy(),
2989 {Builder.getPtrTy(), Builder.getInt16Ty(),
2990 Builder.getInt16Ty(), Builder.getInt16Ty()},
2991 /* IsVarArg */ false);
2992 Function *SarFunc =
2993 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2994 "_omp_reduction_shuffle_and_reduce_func", &M);
2995 SarFunc->setAttributes(FuncAttrs);
2996 SarFunc->addParamAttr(0, Attribute::NoUndef);
2997 SarFunc->addParamAttr(1, Attribute::NoUndef);
2998 SarFunc->addParamAttr(2, Attribute::NoUndef);
2999 SarFunc->addParamAttr(3, Attribute::NoUndef);
3000 SarFunc->addParamAttr(1, Attribute::SExt);
3001 SarFunc->addParamAttr(2, Attribute::SExt);
3002 SarFunc->addParamAttr(3, Attribute::SExt);
3003 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3004 Builder.SetInsertPoint(EntryBB);
3005
3006 // Thread-local Reduce list used to host the values to be reduced.
3007 Argument *ReduceListArg = SarFunc->getArg(0);
3008 // Current lane id; could be logical.
3009 Argument *LaneIDArg = SarFunc->getArg(1);
3010 // Offset of the remote source lane relative to the current lane.
3011 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3012 // Algorithm version. This is expected to be known at compile time.
3013 Argument *AlgoVerArg = SarFunc->getArg(3);
3014
3015 Type *ReduceListArgType = ReduceListArg->getType();
3016 Type *LaneIDArgType = LaneIDArg->getType();
3017 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3018 Value *ReduceListAlloca = Builder.CreateAlloca(
3019 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3020 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3021 LaneIDArg->getName() + ".addr");
3022 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3023 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3024 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3025 AlgoVerArg->getName() + ".addr");
3026 ArrayType *RedListArrayTy =
3027 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3028
3029 // Create a local thread-private variable to host the Reduce list
3030 // from a remote lane.
3031 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3032 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3033
3034 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3035 ReduceListAlloca, ReduceListArgType,
3036 ReduceListAlloca->getName() + ".ascast");
3037 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3038 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3039 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3040 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3041 RemoteLaneOffsetAlloca->getName() + ".ascast");
3042 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3044 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3045 RemoteReductionListAlloca, Builder.getPtrTy(),
3046 RemoteReductionListAlloca->getName() + ".ascast");
3047
3048 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3049 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3050 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3051 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3052
3053 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3054 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3055 Value *RemoteLaneOffset =
3056 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3057 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3058
3059 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3060
3061 // This loop iterates through the list of reduce elements and copies,
3062 // element by element, from a remote lane in the warp to RemoteReduceList,
3063 // hosted on the thread's stack.
3064 Error EmitRedLsCpRes = emitReductionListCopy(
3065 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3066 ReduceList, RemoteListAddrCast, IsByRef,
3067 {RemoteLaneOffset, nullptr, nullptr});
3068
3069 if (EmitRedLsCpRes)
3070 return EmitRedLsCpRes;
3071
3072 // The actions to be performed on the Remote Reduce list depend on the
3073 // algorithm version.
3074 //
3075 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3076 // LaneId % 2 == 0 && Offset > 0):
3077 // do the reduction value aggregation
3078 //
3079 // The thread local variable Reduce list is mutated in place to host the
3080 // reduced data, which is the aggregated value produced from local and
3081 // remote lanes.
3082 //
3083 // Note that AlgoVer is expected to be a constant integer known at compile
3084 // time.
3085 // When AlgoVer==0, the first conjunction evaluates to true, making
3086 // the entire predicate true at compile time.
3087 // When AlgoVer==1, only the second part of the second conjunction needs
3088 // to be evaluated at runtime; the other conjunctions fold to false
3089 // at compile time.
3090 // When AlgoVer==2, only the second part of the third conjunction needs
3091 // to be evaluated at runtime; the other conjunctions fold to false
3092 // at compile time.
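 // For example, when AlgoVer==2 the predicate built below folds to
 // ((LaneId & 1) == 0) && (RemoteLaneOffset > 0) once the constant AlgoVer
 // comparisons are simplified.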
3093 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3094 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3095 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3096 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3097 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3098 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3099 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3100 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3101 Value *RemoteOffsetComp =
3102 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3103 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3104 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3105 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3106
3107 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3108 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3109 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3110
3111 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3112 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3113 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3114 ReduceList, Builder.getPtrTy());
3115 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3116 RemoteListAddrCast, Builder.getPtrTy());
3117 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3118 ->addFnAttr(Attribute::NoUnwind);
3119 Builder.CreateBr(MergeBB);
3120
3121 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3122 Builder.CreateBr(MergeBB);
3123
3124 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3125
3126 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3127 // Reduce list.
3128 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3129 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3130 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3131
3132 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3133 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3134 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3135 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3136
3137 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3138
3139 EmitRedLsCpRes = emitReductionListCopy(
3140 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3141 RemoteListAddrCast, ReduceList, IsByRef);
3142
3143 if (EmitRedLsCpRes)
3144 return EmitRedLsCpRes;
3145
3146 Builder.CreateBr(CpyMergeBB);
3147
3148 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3149 Builder.CreateBr(CpyMergeBB);
3150
3151 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3152
3153 Builder.CreateRetVoid();
3154
3155 return SarFunc;
3156}
3157
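// Emits the helper function that copies each element of a thread-private
// reduce list into slot Idx of the team-wide global reduction buffer.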
3158Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3160 AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGCFunc =
3168 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3169 "_omp_reduction_list_to_global_copy_func", &M);
3170 LtGCFunc->setAttributes(FuncAttrs);
3171 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGCFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGCFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGCFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3192 BufferArgAlloca, Builder.getPtrTy(),
3193 BufferArgAlloca->getName() + ".ascast");
3194 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3195 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3196 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3197 ReduceListArgAlloca, Builder.getPtrTy(),
3198 ReduceListArgAlloca->getName() + ".ascast");
3199
3200 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3201 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3202 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3203
3204 Value *LocalReduceList =
3205 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3206 Value *BufferArgVal =
3207 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3208 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3209 Type *IndexTy = Builder.getIndexTy(
3210 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3211 for (auto En : enumerate(ReductionInfos)) {
3212 const ReductionInfo &RI = En.value();
3213 auto *RedListArrayTy =
3214 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3215 // Reduce element = LocalReduceList[i]
3216 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3217 RedListArrayTy, LocalReduceList,
3218 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3219 // elemptr = ((CopyType*)(elemptrptr)) + I
3220 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3221
3222 // Global = Buffer.VD[Idx];
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3225 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3226 ReductionsBufferTy, BufferVD, 0, En.index());
3227
3228 switch (RI.EvaluationKind) {
3229 case EvalKind::Scalar: {
3230 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3231 Builder.CreateStore(TargetElement, GlobVal);
3232 break;
3233 }
3234 case EvalKind::Complex: {
3235 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3236 RI.ElementType, ElemPtr, 0, 0, ".realp");
3237 Value *SrcReal = Builder.CreateLoad(
3238 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3239 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3240 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3241 Value *SrcImg = Builder.CreateLoad(
3242 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3243
3244 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3245 RI.ElementType, GlobVal, 0, 0, ".realp");
3246 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3247 RI.ElementType, GlobVal, 0, 1, ".imagp");
3248 Builder.CreateStore(SrcReal, DestRealPtr);
3249 Builder.CreateStore(SrcImg, DestImgPtr);
3250 break;
3251 }
3252 case EvalKind::Aggregate: {
3253 Value *SizeVal =
3254 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3255 Builder.CreateMemCpy(
3256 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3257 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3258 break;
3259 }
3260 }
3261 }
3262
3263 Builder.CreateRetVoid();
3264 Builder.restoreIP(OldIP);
3265 return LtGCFunc;
3266}
3267
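// Emits the helper function that builds a reduce list of pointers into slot
// Idx of the global reduction buffer and then calls
// reduce_function(GlobalReduceList, ReduceList) to combine the two lists.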
3268Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3269 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3270 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3271 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3272 LLVMContext &Ctx = M.getContext();
3273 auto *FuncTy = FunctionType::get(
3274 Builder.getVoidTy(),
3275 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3276 /* IsVarArg */ false);
3277 Function *LtGRFunc =
3278 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3279 "_omp_reduction_list_to_global_reduce_func", &M);
3280 LtGRFunc->setAttributes(FuncAttrs);
3281 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3282 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3283 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3284
3285 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3286 Builder.SetInsertPoint(EntryBlock);
3287
3288 // Buffer: global reduction buffer.
3289 Argument *BufferArg = LtGRFunc->getArg(0);
3290 // Idx: index of the buffer.
3291 Argument *IdxArg = LtGRFunc->getArg(1);
3292 // ReduceList: thread local Reduce list.
3293 Argument *ReduceListArg = LtGRFunc->getArg(2);
3294
3295 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3296 BufferArg->getName() + ".addr");
3297 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3298 IdxArg->getName() + ".addr");
3299 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3300 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3301 auto *RedListArrayTy =
3302 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3303
3304 // 1. Build a list of reduction variables.
3305 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3306 Value *LocalReduceList =
3307 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3308
3309 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3310 BufferArgAlloca, Builder.getPtrTy(),
3311 BufferArgAlloca->getName() + ".ascast");
3312 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3313 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3314 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3315 ReduceListArgAlloca, Builder.getPtrTy(),
3316 ReduceListArgAlloca->getName() + ".ascast");
3317 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3318 LocalReduceList, Builder.getPtrTy(),
3319 LocalReduceList->getName() + ".ascast");
3320
3321 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3322 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3323 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3324
3325 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3326 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3327 Type *IndexTy = Builder.getIndexTy(
3328 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3329 for (auto En : enumerate(ReductionInfos)) {
3330 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3331 RedListArrayTy, LocalReduceListAddrCast,
3332 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3333 Value *BufferVD =
3334 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3335 // Global = Buffer.VD[Idx];
3336 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3337 ReductionsBufferTy, BufferVD, 0, En.index());
3338 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3339 }
3340
3341 // Call reduce_function(GlobalReduceList, ReduceList)
3342 Value *ReduceList =
3343 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3344 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3345 ->addFnAttr(Attribute::NoUnwind);
3346 Builder.CreateRetVoid();
3347 Builder.restoreIP(OldIP);
3348 return LtGRFunc;
3349}
3350
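// Emits the helper function that copies slot Idx of the global reduction
// buffer back into a thread-private reduce list; the inverse of the
// list-to-global copy above.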
3351Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3352 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3353 AttributeList FuncAttrs) {
3354 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3355 LLVMContext &Ctx = M.getContext();
3356 auto *FuncTy = FunctionType::get(
3357 Builder.getVoidTy(),
3358 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3359 /* IsVarArg */ false);
3360 Function *LtGCFunc =
3361 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3362 "_omp_reduction_global_to_list_copy_func", &M);
3363 LtGCFunc->setAttributes(FuncAttrs);
3364 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3365 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3366 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3367
3368 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3369 Builder.SetInsertPoint(EntryBlock);
3370
3371 // Buffer: global reduction buffer.
3372 Argument *BufferArg = LtGCFunc->getArg(0);
3373 // Idx: index of the buffer.
3374 Argument *IdxArg = LtGCFunc->getArg(1);
3375 // ReduceList: thread local Reduce list.
3376 Argument *ReduceListArg = LtGCFunc->getArg(2);
3377
3378 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3379 BufferArg->getName() + ".addr");
3380 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3381 IdxArg->getName() + ".addr");
3382 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3383 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3384 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3385 BufferArgAlloca, Builder.getPtrTy(),
3386 BufferArgAlloca->getName() + ".ascast");
3387 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3388 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3389 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3390 ReduceListArgAlloca, Builder.getPtrTy(),
3391 ReduceListArgAlloca->getName() + ".ascast");
3392 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3393 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3394 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3395
3396 Value *LocalReduceList =
3397 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3398 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3399 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3400 Type *IndexTy = Builder.getIndexTy(
3401 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3402 for (auto En : enumerate(ReductionInfos)) {
3403 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3404 auto *RedListArrayTy =
3405 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3406 // Reduce element = LocalReduceList[i]
3407 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3408 RedListArrayTy, LocalReduceList,
3409 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3410 // elemptr = ((CopyType*)(elemptrptr)) + I
3411 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3412 // Global = Buffer.VD[Idx];
3413 Value *BufferVD =
3414 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3415 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3416 ReductionsBufferTy, BufferVD, 0, En.index());
3417
3418 switch (RI.EvaluationKind) {
3419 case EvalKind::Scalar: {
3420 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3421 Builder.CreateStore(TargetElement, ElemPtr);
3422 break;
3423 }
3424 case EvalKind::Complex: {
3425 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3426 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3427 Value *SrcReal = Builder.CreateLoad(
3428 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3429 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3430 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3431 Value *SrcImg = Builder.CreateLoad(
3432 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3433
3434 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3435 RI.ElementType, ElemPtr, 0, 0, ".realp");
3436 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3437 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3438 Builder.CreateStore(SrcReal, DestRealPtr);
3439 Builder.CreateStore(SrcImg, DestImgPtr);
3440 break;
3441 }
3442 case EvalKind::Aggregate: {
3443 Value *SizeVal =
3444 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3445 Builder.CreateMemCpy(
3446 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3447 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3448 SizeVal, false);
3449 break;
3450 }
3451 }
3452 }
3453
3454 Builder.CreateRetVoid();
3455 Builder.restoreIP(OldIP);
3456 return LtGCFunc;
3457}
3458
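// Emits the helper function that builds a reduce list of pointers into slot
// Idx of the global reduction buffer and then calls
// reduce_function(ReduceList, GlobalReduceList) to reduce it into the
// thread-private list.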
3459Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3460 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3461 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3462 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3463 LLVMContext &Ctx = M.getContext();
3464 auto *FuncTy = FunctionType::get(
3465 Builder.getVoidTy(),
3466 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3467 /* IsVarArg */ false);
3468 Function *LtGRFunc =
3469 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3470 "_omp_reduction_global_to_list_reduce_func", &M);
3471 LtGRFunc->setAttributes(FuncAttrs);
3472 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3473 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3474 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3475
3476 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3477 Builder.SetInsertPoint(EntryBlock);
3478
3479 // Buffer: global reduction buffer.
3480 Argument *BufferArg = LtGRFunc->getArg(0);
3481 // Idx: index of the buffer.
3482 Argument *IdxArg = LtGRFunc->getArg(1);
3483 // ReduceList: thread local Reduce list.
3484 Argument *ReduceListArg = LtGRFunc->getArg(2);
3485
3486 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3487 BufferArg->getName() + ".addr");
3488 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3489 IdxArg->getName() + ".addr");
3490 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3491 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3492 ArrayType *RedListArrayTy =
3493 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3494
3495 // 1. Build a list of reduction variables.
3496 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3497 Value *LocalReduceList =
3498 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3499
3500 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3501 BufferArgAlloca, Builder.getPtrTy(),
3502 BufferArgAlloca->getName() + ".ascast");
3503 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3504 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3505 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3506 ReduceListArgAlloca, Builder.getPtrTy(),
3507 ReduceListArgAlloca->getName() + ".ascast");
3508 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3509 LocalReduceList, Builder.getPtrTy(),
3510 LocalReduceList->getName() + ".ascast");
3511
3512 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3513 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3514 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3515
3516 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3517 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3518 Type *IndexTy = Builder.getIndexTy(
3519 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3520 for (auto En : enumerate(ReductionInfos)) {
3521 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3522 RedListArrayTy, ReductionList,
3523 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3524 // Global = Buffer.VD[Idx];
3525 Value *BufferVD =
3526 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3527 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3528 ReductionsBufferTy, BufferVD, 0, En.index());
3529 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3530 }
3531
3532 // Call reduce_function(ReduceList, GlobalReduceList)
3533 Value *ReduceList =
3534 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3535 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
3536 ->addFnAttr(Attribute::NoUnwind);
3537 Builder.CreateRetVoid();
3538 Builder.restoreIP(OldIP);
3539 return LtGRFunc;
3540}
3541
3542std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3543 std::string Suffix =
3544 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3545 return (Name + Suffix).str();
3546}
3547
3548Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3549 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3550 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
3551 AttributeList FuncAttrs) {
3552 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3553 {Builder.getPtrTy(), Builder.getPtrTy()},
3554 /* IsVarArg */ false);
3555 std::string Name = getReductionFuncName(ReducerName);
3556 Function *ReductionFunc =
3557 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3558 ReductionFunc->setAttributes(FuncAttrs);
3559 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3560 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3561 BasicBlock *EntryBB =
3562 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3563 Builder.SetInsertPoint(EntryBB);
3564
3565 // Need to alloca memory here and deal with the pointers before getting
3566 // the LHS/RHS pointers out.
3567 Value *LHSArrayPtr = nullptr;
3568 Value *RHSArrayPtr = nullptr;
3569 Argument *Arg0 = ReductionFunc->getArg(0);
3570 Argument *Arg1 = ReductionFunc->getArg(1);
3571 Type *Arg0Type = Arg0->getType();
3572 Type *Arg1Type = Arg1->getType();
3573
3574 Value *LHSAlloca =
3575 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3576 Value *RHSAlloca =
3577 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3578 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3579 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3580 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3581 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3582 Builder.CreateStore(Arg0, LHSAddrCast);
3583 Builder.CreateStore(Arg1, RHSAddrCast);
3584 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3585 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3586
3587 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3588 Type *IndexTy = Builder.getIndexTy(
3589 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3590 SmallVector<Value *> LHSPtrs, RHSPtrs;
3591 for (auto En : enumerate(ReductionInfos)) {
3592 const ReductionInfo &RI = En.value();
3593 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3594 RedArrayTy, RHSArrayPtr,
3595 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3596 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3597 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3598 RHSI8Ptr, RI.PrivateVariable->getType(),
3599 RHSI8Ptr->getName() + ".ascast");
3600
3601 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3602 RedArrayTy, LHSArrayPtr,
3603 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3604 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3605 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3606 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3607
3608 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3609 LHSPtrs.emplace_back(LHSPtr);
3610 RHSPtrs.emplace_back(RHSPtr);
3611 } else {
3612 Value *LHS = LHSPtr;
3613 Value *RHS = RHSPtr;
3614
3615 if (!IsByRef.empty() && !IsByRef[En.index()]) {
3616 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3617 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3618 }
3619
3620 Value *Reduced;
3621 InsertPointOrErrorTy AfterIP =
3622 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3623 if (!AfterIP)
3624 return AfterIP.takeError();
3625 if (!Builder.GetInsertBlock())
3626 return ReductionFunc;
3627
3628 Builder.restoreIP(*AfterIP);
3629
3630 if (!IsByRef.empty() && !IsByRef[En.index()])
3631 Builder.CreateStore(Reduced, LHSPtr);
3632 }
3633 }
3634
3635 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3636 for (auto En : enumerate(ReductionInfos)) {
3637 unsigned Index = En.index();
3638 const ReductionInfo &RI = En.value();
3639 Value *LHSFixupPtr, *RHSFixupPtr;
3640 Builder.restoreIP(RI.ReductionGenClang(
3641 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3642
3643 // Fix the callback code generated to use the correct Values for the LHS
3644 // and RHS.
3645 LHSFixupPtr->replaceUsesWithIf(
3646 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3647 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3648 ReductionFunc;
3649 });
3650 RHSFixupPtr->replaceUsesWithIf(
3651 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3652 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3653 ReductionFunc;
3654 });
3655 }
3656
3657 Builder.CreateRetVoid();
3658 return ReductionFunc;
3659}
3660
3661static void
3662checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3663 bool IsGPU) {
3664 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3665 (void)RI;
3666 assert(RI.Variable && "expected non-null variable");
3667 assert(RI.PrivateVariable && "expected non-null private variable");
3668 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3669 "expected non-null reduction generator callback");
3670 if (!IsGPU) {
3671 assert(
3672 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3673 "expected variables and their private equivalents to have the same "
3674 "type");
3675 }
3676 assert(RI.Variable->getType()->isPointerTy() &&
3677 "expected variables to be pointers");
3678 }
3679}
3680
3681OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3682 const LocationDescription &Loc, InsertPointTy AllocaIP,
3683 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3684 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
3685 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3686 unsigned ReductionBufNum, Value *SrcLocInfo) {
3687 if (!updateToLocation(Loc))
3688 return InsertPointTy();
3689 Builder.restoreIP(CodeGenIP);
3690 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3691 LLVMContext &Ctx = M.getContext();
3692
3693 // Source location for the ident struct
3694 if (!SrcLocInfo) {
3695 uint32_t SrcLocStrSize;
3696 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3697 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3698 }
3699
3700 if (ReductionInfos.size() == 0)
3701 return Builder.saveIP();
3702
3703 BasicBlock *ContinuationBlock = nullptr;
3704 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3705 // Copied code from createReductions
3706 BasicBlock *InsertBlock = Loc.IP.getBlock();
3707 ContinuationBlock =
3708 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3709 InsertBlock->getTerminator()->eraseFromParent();
3710 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3711 }
3712
3713 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3714 AttributeList FuncAttrs;
3715 AttrBuilder AttrBldr(Ctx);
3716 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3717 AttrBldr.addAttribute(Attr);
3718 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3719 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3720
3721 CodeGenIP = Builder.saveIP();
3722 Expected<Function *> ReductionResult = createReductionFunction(
3723 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
3724 ReductionGenCBKind, FuncAttrs);
3725 if (!ReductionResult)
3726 return ReductionResult.takeError();
3727 Function *ReductionFunc = *ReductionResult;
3728 Builder.restoreIP(CodeGenIP);
3729
3730 // Set the grid value in the config needed for lowering later on
3731 if (GridValue.has_value())
3732 Config.setGridValue(GridValue.value());
3733 else
3734 Config.setGridValue(getGridValue(T, ReductionFunc));
3735
3736 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3737 // RedList, shuffle_reduce_func, interwarp_copy_func);
3738 // or
3739 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3740 Value *Res;
3741
3742 // 1. Build a list of reduction variables.
3743 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3744 auto Size = ReductionInfos.size();
3745 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
3746 Type *FuncPtrTy =
3747 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
3748 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3749 CodeGenIP = Builder.saveIP();
3750 Builder.restoreIP(AllocaIP);
3751 Value *ReductionListAlloca =
3752 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3753 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3754 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3755 Builder.restoreIP(CodeGenIP);
3756 Type *IndexTy = Builder.getIndexTy(
3757 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3758 for (auto En : enumerate(ReductionInfos)) {
3759 const ReductionInfo &RI = En.value();
3760 Value *ElemPtr = Builder.CreateInBoundsGEP(
3761 RedArrayTy, ReductionList,
3762 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3763
3764 Value *PrivateVar = RI.PrivateVariable;
3765 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3766 if (IsByRefElem)
3767 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
3768
3769 Value *CastElem =
3770 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
3771 Builder.CreateStore(CastElem, ElemPtr);
3772 }
3773 CodeGenIP = Builder.saveIP();
3774 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
3775 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
3776
3777 if (!SarFunc)
3778 return SarFunc.takeError();
3779
3780 Expected<Function *> CopyResult =
3781 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
3782 if (!CopyResult)
3783 return CopyResult.takeError();
3784 Function *WcFunc = *CopyResult;
3785 Builder.restoreIP(CodeGenIP);
3786
3787 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3788
3789 unsigned MaxDataSize = 0;
3790 SmallVector<Type *> ReductionTypeArgs;
3791 for (auto En : enumerate(ReductionInfos)) {
3792 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3793 if (Size > MaxDataSize)
3794 MaxDataSize = Size;
3795 ReductionTypeArgs.emplace_back(En.value().ElementType);
3796 }
3797 Value *ReductionDataSize =
3798 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3799 if (!IsTeamsReduction) {
3800 Value *SarFuncCast =
3801 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
3802 Value *WcFuncCast =
3803 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
3804 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3805 WcFuncCast};
3806 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3807 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3808 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
3809 } else {
3810 CodeGenIP = Builder.saveIP();
3811 StructType *ReductionsBufferTy = StructType::create(
3812 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3813 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3814 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3815 Function *LtGCFunc = emitListToGlobalCopyFunction(
3816 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3817 Function *LtGRFunc = emitListToGlobalReduceFunction(
3818 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3819 Function *GtLCFunc = emitGlobalToListCopyFunction(
3820 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3821 Function *GtLRFunc = emitGlobalToListReduceFunction(
3822 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3823 Builder.restoreIP(CodeGenIP);
3824
3825 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
3826 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3827
3828 Value *Args3[] = {SrcLocInfo,
3829 KernelTeamsReductionPtr,
3830 Builder.getInt32(ReductionBufNum),
3831 ReductionDataSize,
3832 RL,
3833 *SarFunc,
3834 WcFunc,
3835 LtGCFunc,
3836 LtGRFunc,
3837 GtLCFunc,
3838 GtLRFunc};
3839
3840 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3841 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3842 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
3843 }
3844
3845 // 5. Build if (res == 1)
3846 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3847 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3848 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3849 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3850
3851 // 6. Build then branch: where we have reduced values in the master
3852 // thread in each team.
3853 // __kmpc_end_reduce{_nowait}(<gtid>);
3854 // break;
3855 emitBlock(ThenBB, CurFunc);
3856
3857 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3858 for (auto En : enumerate(ReductionInfos)) {
3859 const ReductionInfo &RI = En.value();
3860 Type *ValueType = RI.ElementType;
3861 Value *RedValue = RI.Variable;
3862 Value *RHS =
3863 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3864
3865 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3866 Value *LHSPtr, *RHSPtr;
3867 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3868 &LHSPtr, &RHSPtr, CurFunc));
3869
3870 // Fix the callback code generated to use the correct Values for the LHS
3871 // and RHS.
3872 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
3873 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3874 ReductionFunc;
3875 });
3876 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3877 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3878 ReductionFunc;
3879 });
3880 } else {
3881 if (IsByRef.empty() || !IsByRef[En.index()]) {
3882 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3883 "red.value." + Twine(En.index()));
3884 }
3885 Value *PrivateRedValue = Builder.CreateLoad(
3886 ValueType, RHS, "red.private.value" + Twine(En.index()));
3887 Value *Reduced;
3888 InsertPointOrErrorTy AfterIP =
3889 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3890 if (!AfterIP)
3891 return AfterIP.takeError();
3892 Builder.restoreIP(*AfterIP);
3893
3894 if (!IsByRef.empty() && !IsByRef[En.index()])
3895 Builder.CreateStore(Reduced, RI.Variable);
3896 }
3897 }
3898 emitBlock(ExitBB, CurFunc);
3899 if (ContinuationBlock) {
3900 Builder.CreateBr(ContinuationBlock);
3901 Builder.SetInsertPoint(ContinuationBlock);
3902 }
3903 Config.setEmitLLVMUsed();
3904
3905 return Builder.saveIP();
3906}
3907
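// Creates a fresh internal reduction function declaration taking two
// type-erased reduce-list pointers; its body is later filled in by
// populateReductionFunction.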
3908static Function *getFreshReductionFunc(Module &M) {
3909 Type *VoidTy = Type::getVoidTy(M.getContext());
3910 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3911 auto *FuncTy =
3912 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3914 ".omp.reduction.func", &M);
3915}
3916
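// Fills in the body of an outlined reduction function: walks the two
// type-erased pointer arrays, loads each LHS/RHS element pair, and applies
// the per-variable ReductionGen callback, storing the result back unless the
// variable is passed by reference.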
3917static Error populateReductionFunction(
3918 Function *ReductionFunc,
3919 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3920 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3921 Module *Module = ReductionFunc->getParent();
3922 BasicBlock *ReductionFuncBlock =
3923 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3924 Builder.SetInsertPoint(ReductionFuncBlock);
3925 Value *LHSArrayPtr = nullptr;
3926 Value *RHSArrayPtr = nullptr;
3927 if (IsGPU) {
3928 // Need to alloca memory here and deal with the pointers before getting
3929 // the LHS/RHS pointers out.
3930 //
3931 Argument *Arg0 = ReductionFunc->getArg(0);
3932 Argument *Arg1 = ReductionFunc->getArg(1);
3933 Type *Arg0Type = Arg0->getType();
3934 Type *Arg1Type = Arg1->getType();
3935
3936 Value *LHSAlloca =
3937 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3938 Value *RHSAlloca =
3939 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3940 Value *LHSAddrCast =
3941 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3942 Value *RHSAddrCast =
3943 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3944 Builder.CreateStore(Arg0, LHSAddrCast);
3945 Builder.CreateStore(Arg1, RHSAddrCast);
3946 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3947 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3948 } else {
3949 LHSArrayPtr = ReductionFunc->getArg(0);
3950 RHSArrayPtr = ReductionFunc->getArg(1);
3951 }
3952
3953 unsigned NumReductions = ReductionInfos.size();
3954 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3955
3956 for (auto En : enumerate(ReductionInfos)) {
3957 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3958 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3959 RedArrayTy, LHSArrayPtr, 0, En.index());
3960 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3961 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3962 LHSI8Ptr, RI.Variable->getType());
3963 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3964 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3965 RedArrayTy, RHSArrayPtr, 0, En.index());
3966 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3967 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3968 RHSI8Ptr, RI.PrivateVariable->getType());
3969 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3970 Value *Reduced;
3971 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3972 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3973 if (!AfterIP)
3974 return AfterIP.takeError();
3975
3976 Builder.restoreIP(*AfterIP);
3977 // TODO: Consider flagging an error.
3978 if (!Builder.GetInsertBlock())
3979 return Error::success();
3980
3981 // The store is inside the reduction region when using by-ref.
3982 if (!IsByRef[En.index()])
3983 Builder.CreateStore(Reduced, LHSPtr);
3984 }
3985 Builder.CreateRetVoid();
3986 return Error::success();
3987}
3988
3989OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3990 const LocationDescription &Loc, InsertPointTy AllocaIP,
3991 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3992 bool IsNoWait, bool IsTeamsReduction) {
3993 assert(ReductionInfos.size() == IsByRef.size());
3994 if (Config.isGPU())
3995 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3996 IsByRef, IsNoWait, IsTeamsReduction);
3997
3998 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3999
4000 if (!updateToLocation(Loc))
4001 return InsertPointTy();
4002
4003 if (ReductionInfos.size() == 0)
4004 return Builder.saveIP();
4005
4006 BasicBlock *InsertBlock = Loc.IP.getBlock();
4007 BasicBlock *ContinuationBlock =
4008 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4009 InsertBlock->getTerminator()->eraseFromParent();
4010
4011 // Create and populate array of type-erased pointers to private reduction
4012 // values.
4013 unsigned NumReductions = ReductionInfos.size();
4014 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4015 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4016 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4017
4018 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4019
4020 for (auto En : enumerate(ReductionInfos)) {
4021 unsigned Index = En.index();
4022 const ReductionInfo &RI = En.value();
4023 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4024 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4025 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4026 }
4027
4028 // Emit a call to the runtime function that orchestrates the reduction.
4029 // Declare the reduction function in the process.
4030 Type *IndexTy = Builder.getIndexTy(
4031 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4032 Function *Func = Builder.GetInsertBlock()->getParent();
4033 Module *Module = Func->getParent();
4034 uint32_t SrcLocStrSize;
4035 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4036 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4037 return RI.AtomicReductionGen;
4038 });
4039 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4040 CanGenerateAtomic
4041 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4042 : IdentFlag(0));
4043 Value *ThreadId = getOrCreateThreadID(Ident);
4044 Constant *NumVariables = Builder.getInt32(NumReductions);
4045 const DataLayout &DL = Module->getDataLayout();
4046 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4047 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4048 Function *ReductionFunc = getFreshReductionFunc(*Module);
4049 Value *Lock = getOMPCriticalRegionLock(".reduction");
4050 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4051 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4052 : RuntimeFunction::OMPRTL___kmpc_reduce);
4053 CallInst *ReduceCall =
4054 createRuntimeFunctionCall(ReduceFunc,
4055 {Ident, ThreadId, NumVariables, RedArraySize,
4056 RedArray, ReductionFunc, Lock},
4057 "reduce");
4058
4059 // Create final reduction entry blocks for the atomic and non-atomic case.
4060 // Emit IR that dispatches control flow to one of the blocks based on the
4061 // reduction supporting the atomic mode.
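 // The generated dispatch is roughly:
 //   switch (__kmpc_reduce[_nowait](...)) {
 //   case 1: <non-atomic reduction>; __kmpc_end_reduce[_nowait](...); break;
 //   case 2: <atomic reduction>; break;
 //   default: break;
 //   }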
4062 BasicBlock *NonAtomicRedBlock =
4063 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4064 BasicBlock *AtomicRedBlock =
4065 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4066 SwitchInst *Switch =
4067 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4068 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4069 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4070
4071 // Populate the non-atomic reduction using the elementwise reduction function.
4072 // This loads the elements from the global and private variables and reduces
4073 // them before storing the result back to the global variable.
4074 Builder.SetInsertPoint(NonAtomicRedBlock);
4075 for (auto En : enumerate(ReductionInfos)) {
4076 const ReductionInfo &RI = En.value();
4077 Type *ValueType = RI.ElementType;
4078 // We have one less load for the by-ref case because that load is now
4079 // inside the reduction region.
4080 Value *RedValue = RI.Variable;
4081 if (!IsByRef[En.index()]) {
4082 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4083 "red.value." + Twine(En.index()));
4084 }
4085 Value *PrivateRedValue =
4086 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4087 "red.private.value." + Twine(En.index()));
4088 Value *Reduced;
4089 InsertPointOrErrorTy AfterIP =
4090 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4091 if (!AfterIP)
4092 return AfterIP.takeError();
4093 Builder.restoreIP(*AfterIP);
4094
4095 if (!Builder.GetInsertBlock())
4096 return InsertPointTy();
4097 // For the by-ref case, this store happens inside the reduction region.
4098 if (!IsByRef[En.index()])
4099 Builder.CreateStore(Reduced, RI.Variable);
4100 }
4101 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4102 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4103 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4104 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4105 Builder.CreateBr(ContinuationBlock);
4106
4107 // Populate the atomic reduction using the atomic elementwise reduction
4108 // function. There are no loads/stores here because they happen inside
4109 // the atomic elementwise reduction.
4110 Builder.SetInsertPoint(AtomicRedBlock);
4111 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4112 for (const ReductionInfo &RI : ReductionInfos) {
4113 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4114 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4115 if (!AfterIP)
4116 return AfterIP.takeError();
4117 Builder.restoreIP(*AfterIP);
4118 if (!Builder.GetInsertBlock())
4119 return InsertPointTy();
4120 }
4121 Builder.CreateBr(ContinuationBlock);
4122 } else {
4123 Builder.CreateUnreachable();
4124 }
4125
4126 // Populate the outlined reduction function using the elementwise reduction
4127 // function. Partial values are extracted from the type-erased array of
4128 // pointers to private variables.
4129 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4130 IsByRef, /*isGPU=*/false);
4131 if (Err)
4132 return Err;
4133
4134 if (!Builder.GetInsertBlock())
4135 return InsertPointTy();
4136
4137 Builder.SetInsertPoint(ContinuationBlock);
4138 return Builder.saveIP();
4139}
4140
4141OpenMPIRBuilder::InsertPointOrErrorTy
4142OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4143 BodyGenCallbackTy BodyGenCB,
4144 FinalizeCallbackTy FiniCB) {
4145 if (!updateToLocation(Loc))
4146 return Loc.IP;
4147
4148 Directive OMPD = Directive::OMPD_master;
4149 uint32_t SrcLocStrSize;
4150 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4151 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4152 Value *ThreadId = getOrCreateThreadID(Ident);
4153 Value *Args[] = {Ident, ThreadId};
4154
4155 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4156 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4157
4158 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4159 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4160
4161 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4162 /*Conditional*/ true, /*hasFinalize*/ true);
4163}
4164
4165OpenMPIRBuilder::InsertPointOrErrorTy
4166OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4167 BodyGenCallbackTy BodyGenCB,
4168 FinalizeCallbackTy FiniCB, Value *Filter) {
4169 if (!updateToLocation(Loc))
4170 return Loc.IP;
4171
4172 Directive OMPD = Directive::OMPD_masked;
4173 uint32_t SrcLocStrSize;
4174 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4175 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4176 Value *ThreadId = getOrCreateThreadID(Ident);
4177 Value *Args[] = {Ident, ThreadId, Filter};
4178 Value *ArgsEnd[] = {Ident, ThreadId};
4179
4180 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4181 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4182
4183 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4184 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4185
4186 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4187 /*Conditional*/ true, /*hasFinalize*/ true);
4188}
4189
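// Emits a call to the given runtime function with no operand bundles and
// marks the call as non-throwing.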
4190llvm::CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(
4191 llvm::FunctionCallee Callee,
4192 llvm::ArrayRef<llvm::Value *> Args,
4193 const llvm::Twine &Name) {
4194 llvm::CallInst *Call = Builder.CreateCall(
4195 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4196 Call->setDoesNotThrow();
4197 return Call;
4198}
4199
4200// Expects the input basic block to be dominated by BeforeScanBB.
4201// Once the scan directive is encountered, the code after it must be
4202// dominated by AfterScanBB. The scan directive splits the code sequence
4203// into an input phase and a scan phase. Based on whether the inclusive or
4204// exclusive clause is used on the scan directive, and on whether the input
4205// loop or the scan loop is being lowered, it adds jumps to the input and
4206// scan phases. The first scan loop is the input loop and the second is the
4207// scan loop. The generated code currently handles only inclusive scans.
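// Schematically, for an inclusive scan:
//   first (input) loop:  <input phase>; buffer[i] = red;
//   second (scan) loop:  red = buffer[i]; <scan phase>;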
4208OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4209 const LocationDescription &Loc, InsertPointTy AllocaIP,
4210 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4211 bool IsInclusive, ScanInfo *ScanRedInfo) {
4212 if (ScanRedInfo->OMPFirstScanLoop) {
4213 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4214 ScanVarsType, ScanRedInfo);
4215 if (Err)
4216 return Err;
4217 }
4218 if (!updateToLocation(Loc))
4219 return Loc.IP;
4220
4221 llvm::Value *IV = ScanRedInfo->IV;
4222
4223 if (ScanRedInfo->OMPFirstScanLoop) {
4224 // Emit buffer[i] = red; at the end of the input phase.
4225 for (size_t i = 0; i < ScanVars.size(); i++) {
4226 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4227 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4228 Type *DestTy = ScanVarsType[i];
4229 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4230 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4231
4232 Builder.CreateStore(Src, Val);
4233 }
4234 }
4235 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4236 emitBlock(ScanRedInfo->OMPScanDispatch,
4237 Builder.GetInsertBlock()->getParent());
4238
4239 if (!ScanRedInfo->OMPFirstScanLoop) {
4240 IV = ScanRedInfo->IV;
4241 // Emit red = buffer[i]; at the entrance to the scan phase.
4242 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4243 for (size_t i = 0; i < ScanVars.size(); i++) {
4244 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4245 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4246 Type *DestTy = ScanVarsType[i];
4247 Value *SrcPtr =
4248 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4249 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4250 Builder.CreateStore(Src, ScanVars[i]);
4251 }
4252 }
4253
4254 // TODO: Update it to CreateBr and remove dead blocks
4255 llvm::Value *CmpI = Builder.getInt1(true);
4256 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4257 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4258 ScanRedInfo->OMPAfterScanBlock);
4259 } else {
4260 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4261 ScanRedInfo->OMPBeforeScanBlock);
4262 }
4263 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4264 Builder.GetInsertBlock()->getParent());
4265 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4266 return Builder.saveIP();
4267}
4268
4269Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4270 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4271 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4272
4273 Builder.restoreIP(AllocaIP);
4274 // Create the shared pointer at alloca IP.
4275 for (size_t i = 0; i < ScanVars.size(); i++) {
4276 llvm::Value *BuffPtr =
4277 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4278 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4279 }
4280
4281 // Allocate the temporary buffer on the master thread.
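 // Roughly, per scan variable of type T (a sketch):
 //   T *arr = malloc(sizeof(T) * (span + 1)); // executed by thread 0 only
 //   *vla = arr;                              // publish via the shared alloca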
4282 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4283 InsertPointTy CodeGenIP) -> Error {
4284 Builder.restoreIP(CodeGenIP);
4285 Value *AllocSpan =
4286 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4287 for (size_t i = 0; i < ScanVars.size(); i++) {
4288 Type *IntPtrTy = Builder.getInt32Ty();
4289 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4290 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4291 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4292 AllocSpan, nullptr, "arr");
4293 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4294 }
4295 return Error::success();
4296 };
4297 // TODO: Perform finalization actions for variables. This has to be
4298 // called for variables which have destructors/finalizers.
4299 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4300
4301 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4302 llvm::Value *FilterVal = Builder.getInt32(0);
4303 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4304 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4305
4306 if (!AfterIP)
4307 return AfterIP.takeError();
4308 Builder.restoreIP(*AfterIP);
4309 BasicBlock *InputBB = Builder.GetInsertBlock();
4310 if (InputBB->getTerminator())
4311 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4312 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4313 if (!AfterIP)
4314 return AfterIP.takeError();
4315 Builder.restoreIP(*AfterIP);
4316
4317 return Error::success();
4318}
4319
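// Emits the finalization for a scan-based directive: on the master thread,
// copies the last element of the temporary buffer, buffer[span], back into
// each original reduction variable and frees the buffer.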
4320Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4321 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4322 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4323 InsertPointTy CodeGenIP) -> Error {
4324 Builder.restoreIP(CodeGenIP);
4325 for (ReductionInfo RedInfo : ReductionInfos) {
4326 Value *PrivateVar = RedInfo.PrivateVariable;
4327 Value *OrigVar = RedInfo.Variable;
4328 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4329 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4330
4331 Type *SrcTy = RedInfo.ElementType;
4332 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4333 "arrayOffset");
4334 Value *Src = Builder.CreateLoad(SrcTy, Val);
4335
4336 Builder.CreateStore(Src, OrigVar);
4337 Builder.CreateFree(Buff);
4338 }
4339 return Error::success();
4340 };
4341 // TODO: Perform finalization actions for variables. This has to be
4342 // called for variables which have destructors/finalizers.
4343 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4344
4345 if (ScanRedInfo->OMPScanFinish->getTerminator())
4346 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4347 else
4348 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4349
4350 llvm::Value *FilterVal = Builder.getInt32(0);
4351 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4352 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4353
4354 if (!AfterIP)
4355 return AfterIP.takeError();
4356 Builder.restoreIP(*AfterIP);
4357 BasicBlock *InputBB = Builder.GetInsertBlock();
4358 if (InputBB->getTerminator())
4359 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4360 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4361 if (!AfterIP)
4362 return AfterIP.takeError();
4363 Builder.restoreIP(*AfterIP);
4364 return Error::success();
4365}
4366
4367OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4368 const LocationDescription &Loc,
4369 ArrayRef<ReductionInfo> ReductionInfos,
4370 ScanInfo *ScanRedInfo) {
4371
4372 if (!updateToLocation(Loc))
4373 return Loc.IP;
4374 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4375 InsertPointTy CodeGenIP) -> Error {
4376 Builder.restoreIP(CodeGenIP);
4377 Function *CurFn = Builder.GetInsertBlock()->getParent();
4378 // for (int k = 0; k <= ceil(log2(n)); ++k)
4379 llvm::BasicBlock *LoopBB =
4380 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4381 llvm::BasicBlock *ExitBB =
4382 splitBB(Builder, false, "omp.outer.log.scan.exit");
4383 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4384 Builder.GetInsertBlock()->getModule(),
4385 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4386 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4387 llvm::Value *Arg =
4388 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4389 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4390 F = llvm::Intrinsic::getOrInsertDeclaration(
4391 Builder.GetInsertBlock()->getModule(),
4392 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4393 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4394 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4395 llvm::Value *NMin1 = Builder.CreateNUWSub(
4396 ScanRedInfo->Span,
4397 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4398 Builder.SetInsertPoint(InputBB);
4399 Builder.CreateBr(LoopBB);
4400 emitBlock(LoopBB, CurFn);
4401 Builder.SetInsertPoint(LoopBB);
4402
4403 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4404 // size pow2k = 1;
4405 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4406 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4407 InputBB);
4408 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4409 InputBB);
4410 // for (size i = n - 1; i >= 2 ^ k; --i)
4411 // tmp[i] op= tmp[i-pow2k];
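 // E.g. for n = 4 and op = +, tmp = {1,2,3,4} becomes {1,3,5,7} after the
 // k = 0 pass and {1,3,6,10} after the k = 1 pass (an illustrative trace).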
4412 llvm::BasicBlock *InnerLoopBB =
4413 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4414 llvm::BasicBlock *InnerExitBB =
4415 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4416 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4417 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4418 emitBlock(InnerLoopBB, CurFn);
4419 Builder.SetInsertPoint(InnerLoopBB);
4420 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4421 IVal->addIncoming(NMin1, LoopBB);
4422 for (ReductionInfo RedInfo : ReductionInfos) {
4423 Value *ReductionVal = RedInfo.PrivateVariable;
4424 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4425 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4426 Type *DestTy = RedInfo.ElementType;
4427 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4428 Value *LHSPtr =
4429 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4430 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4431 Value *RHSPtr =
4432 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4433 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4434 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4435 Value *Result = nullptr;
4436 InsertPointOrErrorTy AfterIP =
4437 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4438 if (!AfterIP)
4439 return AfterIP.takeError();
4440 Builder.CreateStore(Result, LHSPtr);
4441 }
4442 llvm::Value *NextIVal = Builder.CreateNUWSub(
4443 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4444 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4445 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4446 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4447 emitBlock(InnerExitBB, CurFn);
4448 llvm::Value *Next = Builder.CreateNUWAdd(
4449 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4450 Counter->addIncoming(Next, Builder.GetInsertBlock());
4451 // pow2k <<= 1;
4452 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4453 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4454 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4455 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4456 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4457 return Error::success();
4458 };
4459
4460 // TODO: Perform finalization actions for variables. This has to be
4461 // called for variables which have destructors/finalizers.
4462 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4463
4464 llvm::Value *FilterVal = Builder.getInt32(0);
4465 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4466 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4467
4468 if (!AfterIP)
4469 return AfterIP.takeError();
4470 Builder.restoreIP(*AfterIP);
4471 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4472
4473 if (!AfterIP)
4474 return AfterIP.takeError();
4475 Builder.restoreIP(*AfterIP);
4476 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4477 if (Err)
4478 return Err;
4479
4480 return AfterIP;
4481}
4482
4483Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4484 llvm::function_ref<Error()> InputLoopGen,
4485 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4486 ScanInfo *ScanRedInfo) {
4487
4488 {
4489 // Emit loop with input phase:
4490 // for (i: 0..<num_iters>) {
4491 // <input phase>;
4492 // buffer[i] = red;
4493 // }
4494 ScanRedInfo->OMPFirstScanLoop = true;
4495 Error Err = InputLoopGen();
4496 if (Err)
4497 return Err;
4498 }
4499 {
4500 // Emit loop with scan phase:
4501 // for (i: 0..<num_iters>) {
4502 // red = buffer[i];
4503 // <scan phase>;
4504 // }
4505 ScanRedInfo->OMPFirstScanLoop = false;
4506 Error Err = ScanLoopGen(Builder.saveIP());
4507 if (Err)
4508 return Err;
4509 }
4510 return Error::success();
4511}
4512
4513void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4514 Function *Fun = Builder.GetInsertBlock()->getParent();
4515 ScanRedInfo->OMPScanDispatch =
4516 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4517 ScanRedInfo->OMPAfterScanBlock =
4518 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4519 ScanRedInfo->OMPBeforeScanBlock =
4520 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4521 ScanRedInfo->OMPScanLoopExit =
4522 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4523}
4524CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4525 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4526 BasicBlock *PostInsertBefore, const Twine &Name) {
4527 Module *M = F->getParent();
4528 LLVMContext &Ctx = M->getContext();
4529 Type *IndVarTy = TripCount->getType();
4530
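 // The skeleton being materialized, roughly: preheader -> header -> cond,
 // where cond either enters body -> ... -> latch (which branches back to
 // header) or leaves the loop via exit -> after.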
4531 // Create the basic block structure.
4532 BasicBlock *Preheader =
4533 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4534 BasicBlock *Header =
4535 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4536 BasicBlock *Cond =
4537 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4538 BasicBlock *Body =
4539 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4540 BasicBlock *Latch =
4541 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4542 BasicBlock *Exit =
4543 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4544 BasicBlock *After =
4545 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4546
4547 // Use specified DebugLoc for new instructions.
4548 Builder.SetCurrentDebugLocation(DL);
4549
4550 Builder.SetInsertPoint(Preheader);
4551 Builder.CreateBr(Header);
4552
4553 Builder.SetInsertPoint(Header);
4554 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4555 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4556 Builder.CreateBr(Cond);
4557
4558 Builder.SetInsertPoint(Cond);
4559 Value *Cmp =
4560 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4561 Builder.CreateCondBr(Cmp, Body, Exit);
4562
4563 Builder.SetInsertPoint(Body);
4564 Builder.CreateBr(Latch);
4565
4566 Builder.SetInsertPoint(Latch);
4567 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4568 "omp_" + Name + ".next", /*HasNUW=*/true);
4569 Builder.CreateBr(Header);
4570 IndVarPHI->addIncoming(Next, Latch);
4571
4572 Builder.SetInsertPoint(Exit);
4573 Builder.CreateBr(After);
4574
4575 // Remember and return the canonical control flow.
4576 LoopInfos.emplace_front();
4577 CanonicalLoopInfo *CL = &LoopInfos.front();
4578
4579 CL->Header = Header;
4580 CL->Cond = Cond;
4581 CL->Latch = Latch;
4582 CL->Exit = Exit;
4583
4584#ifndef NDEBUG
4585 CL->assertOK();
4586#endif
4587 return CL;
4588}
4589
4590Expected<CanonicalLoopInfo *>
4591OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4592 LoopBodyGenCallbackTy BodyGenCB,
4593 Value *TripCount, const Twine &Name) {
4594 BasicBlock *BB = Loc.IP.getBlock();
4595 BasicBlock *NextBB = BB->getNextNode();
4596
4597 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4598 NextBB, NextBB, Name);
4599 BasicBlock *After = CL->getAfter();
4600
4601 // If location is not set, don't connect the loop.
4602 if (updateToLocation(Loc)) {
4603 // Split the loop at the insertion point: Branch to the preheader and move
4604 // every following instruction to after the loop (the After BB). Also, the
4605 // new successor is the loop's after block.
4606 spliceBB(Builder, After, /*CreateBranch=*/false);
4607 Builder.CreateBr(CL->getPreheader());
4608 }
4609
4610 // Emit the body content. We do it after connecting the loop to the CFG to
4611 // avoid that the callback encounters degenerate BBs.
4612 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4613 return Err;
4614
4615#ifndef NDEBUG
4616 CL->assertOK();
4617#endif
4618 return CL;
4619}
4620
4621Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4622 ScanInfos.emplace_front();
4623 ScanInfo *Result = &ScanInfos.front();
4624 return Result;
4625}
4626
4627Expected<SmallVector<CanonicalLoopInfo *>>
4628OpenMPIRBuilder::createCanonicalScanLoops(
4629 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4630 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4631 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4632 LocationDescription ComputeLoc =
4633 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4634 updateToLocation(ComputeLoc);
4635
4636 SmallVector<CanonicalLoopInfo *> Result;
4637
4638 Value *TripCount = calculateCanonicalLoopTripCount(
4639 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4640 ScanRedInfo->Span = TripCount;
4641 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4642 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4643
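 // The callback below rewires each generated loop body through the scan
 // dispatch block, which then selects between the input-phase and scan-phase
 // portions of the body (see createScan).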
4644 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4645 Builder.restoreIP(CodeGenIP);
4646 ScanRedInfo->IV = IV;
4647 createScanBBs(ScanRedInfo);
4648 BasicBlock *InputBlock = Builder.GetInsertBlock();
4649 Instruction *Terminator = InputBlock->getTerminator();
4650 assert(Terminator->getNumSuccessors() == 1);
4651 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4652 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4653 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4654 Builder.GetInsertBlock()->getParent());
4655 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4656 emitBlock(ScanRedInfo->OMPScanLoopExit,
4657 Builder.GetInsertBlock()->getParent());
4658 Builder.CreateBr(ContinueBlock);
4659 Builder.SetInsertPoint(
4660 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4661 return BodyGenCB(Builder.saveIP(), IV);
4662 };
4663
4664 const auto &&InputLoopGen = [&]() -> Error {
4665 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4666 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4667 ComputeIP, Name, true, ScanRedInfo);
4668 if (!LoopInfo)
4669 return LoopInfo.takeError();
4670 Result.push_back(*LoopInfo);
4671 Builder.restoreIP((*LoopInfo)->getAfterIP());
4672 return Error::success();
4673 };
4674 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4675 Expected<CanonicalLoopInfo *> LoopInfo =
4676 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4677 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4678 if (!LoopInfo)
4679 return LoopInfo.takeError();
4680 Result.push_back(*LoopInfo);
4681 Builder.restoreIP((*LoopInfo)->getAfterIP());
4682 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4683 return Error::success();
4684 };
4685 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4686 if (Err)
4687 return Err;
4688 return Result;
4689}
4690
4691Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4692 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4693 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4694
4695 // Consider the following difficulties (assuming 8-bit signed integers):
4696 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4697 // DO I = 1, 100, 50
4698 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4699 // DO I = 100, 0, -128
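 // For example (illustrative): Start=1, Stop=100, Step=50 with InclusiveStop
 // yields Span = 99, Incr = 50, and a trip count of 99/50 + 1 = 2 (the
 // iterations I = 1 and I = 51), without stepping the counter past Stop.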
4700
4701 // Start, Stop and Step must be of the same integer type.
4702 auto *IndVarTy = cast<IntegerType>(Start->getType());
4703 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4704 assert(IndVarTy == Step->getType() && "Step type mismatch");
4705
4706 updateToLocation(Loc);
4707
4708 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4709 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4710
4711 // Like Step, but always positive.
4712 Value *Incr = Step;
4713
4714 // Distance between Start and Stop; always positive.
4715 Value *Span;
4716
4717 // Condition for whether no iterations are executed at all, e.g. because
4718 // UB < LB.
4719 Value *ZeroCmp;
4720
4721 if (IsSigned) {
4722 // Ensure that increment is positive. If not, negate and invert LB and UB.
4723 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4724 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4725 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4726 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4727 Span = Builder.CreateSub(UB, LB, "", false, true);
4728 ZeroCmp = Builder.CreateICmp(
4729 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4730 } else {
4731 Span = Builder.CreateSub(Stop, Start, "", true);
4732 ZeroCmp = Builder.CreateICmp(
4733 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4734 }
4735
4736 Value *CountIfLooping;
4737 if (InclusiveStop) {
4738 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4739 } else {
4740 // Avoid incrementing past stop since it could overflow.
4741 Value *CountIfTwo = Builder.CreateAdd(
4742 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4743 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4744 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4745 }
4746
4747 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4748 "omp_" + Name + ".tripcount");
4749}
4750
4751Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4752 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4753 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4754 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4755 ScanInfo *ScanRedInfo) {
4756 LocationDescription ComputeLoc =
4757 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4758
4759 Value *TripCount = calculateCanonicalLoopTripCount(
4760 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4761
4762 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4763 Builder.restoreIP(CodeGenIP);
4764 Value *Span = Builder.CreateMul(IV, Step);
4765 Value *IndVar = Builder.CreateAdd(Span, Start);
4766 if (InScan)
4767 ScanRedInfo->IV = IndVar;
4768 return BodyGenCB(Builder.saveIP(), IndVar);
4769 };
4770 LocationDescription LoopLoc =
4771 ComputeIP.isSet()
4772 ? Loc
4773 : LocationDescription(Builder.saveIP(),
4774 Builder.getCurrentDebugLocation());
4775 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4776}
4777
4778// Returns an LLVM function to call for initializing loop bounds using OpenMP
4779// static scheduling for composite `distribute parallel for` depending on
4780// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4781// integers as unsigned similarly to CanonicalLoopInfo.
4782static FunctionCallee
4783getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4784 OpenMPIRBuilder &OMPBuilder) {
4785 unsigned Bitwidth = Ty->getIntegerBitWidth();
4786 if (Bitwidth == 32)
4787 return OMPBuilder.getOrCreateRuntimeFunction(
4788 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4789 if (Bitwidth == 64)
4790 return OMPBuilder.getOrCreateRuntimeFunction(
4791 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4792 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4793}
4794
4795// Returns an LLVM function to call for initializing loop bounds using OpenMP
4796// static scheduling depending on `type`. Only i32 and i64 are supported by the
4797// runtime. Always interpret integers as unsigned similarly to
4798// CanonicalLoopInfo.
4799static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4800 OpenMPIRBuilder &OMPBuilder) {
4801 unsigned Bitwidth = Ty->getIntegerBitWidth();
4802 if (Bitwidth == 32)
4803 return OMPBuilder.getOrCreateRuntimeFunction(
4804 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4805 if (Bitwidth == 64)
4806 return OMPBuilder.getOrCreateRuntimeFunction(
4807 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4808 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4809}
4810
4811OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4812 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4813 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
4814 OMPScheduleType DistScheduleSchedType) {
4815 assert(CLI->isValid() && "Requires a valid canonical loop");
4816 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4817 "Require dedicated allocate IP");
4818
4819 // Set up the source location value for OpenMP runtime.
4820 Builder.restoreIP(CLI->getPreheaderIP());
4821 Builder.SetCurrentDebugLocation(DL);
4822
4823 uint32_t SrcLocStrSize;
4824 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4825 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4826
4827 // Declare useful OpenMP runtime functions.
4828 Value *IV = CLI->getIndVar();
4829 Type *IVTy = IV->getType();
4830 FunctionCallee StaticInit =
4831 LoopType == WorksharingLoopType::DistributeForStaticLoop
4832 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4833 : getKmpcForStaticInitForType(IVTy, M, *this);
4834 FunctionCallee StaticFini =
4835 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4836
4837 // Allocate space for computed loop bounds as expected by the "init" function.
4838 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4839
4840 Type *I32Type = Type::getInt32Ty(M.getContext());
4841 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4842 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4843 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4844 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4845 CLI->setLastIter(PLastIter);
4846
4847 // At the end of the preheader, prepare for calling the "init" function by
4848 // storing the current loop bounds into the allocated space. A canonical loop
4849 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4850 // and produces an inclusive upper bound.
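 // For instance, a loop with trip count 64 is described to "init" as
 // [lb=0, ub=63]; with four threads under an even static division, thread 1
 // would read back its sub-range as [16, 31] (an illustrative split).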
4851 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4852 Constant *Zero = ConstantInt::get(IVTy, 0);
4853 Constant *One = ConstantInt::get(IVTy, 1);
4854 Builder.CreateStore(Zero, PLowerBound);
4855 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4856 Builder.CreateStore(UpperBound, PUpperBound);
4857 Builder.CreateStore(One, PStride);
4858
4859 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4860
4861 OMPScheduleType SchedType =
4862 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4863 ? OMPScheduleType::OrderedDistribute
4864 : OMPScheduleType::UnorderedStatic;
4865 Constant *SchedulingType =
4866 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4867
4868 // Call the "init" function and update the trip count of the loop with the
4869 // value it produced.
4870 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
4871 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
4872 this](Value *SchedulingType, auto &Builder) {
4873 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
4874 PLowerBound, PUpperBound});
4875 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4876 Value *PDistUpperBound =
4877 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4878 Args.push_back(PDistUpperBound);
4879 }
4880 Args.append({PStride, One, Zero});
4881 createRuntimeFunctionCall(StaticInit, Args);
4882 };
4883 BuildInitCall(SchedulingType, Builder);
4884 if (HasDistSchedule &&
4885 LoopType != WorksharingLoopType::DistributeStaticLoop) {
4886 Constant *DistScheduleSchedType = ConstantInt::get(
4887 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
4888 // We want to emit a second init function call for the dist_schedule clause
4889 // of the Distribute construct. However, this should only be done if a
4890 // workshare loop is nested within a distribute construct.
4891 BuildInitCall(DistScheduleSchedType, Builder);
4892 }
4893 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4894 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4895 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4896 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4897 CLI->setTripCount(TripCount);
4898
4899 // Update all uses of the induction variable except the one in the condition
4900 // block that compares it with the actual upper bound, and the increment in
4901 // the latch block.
4902
4903 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4904 Builder.SetInsertPoint(CLI->getBody(),
4905 CLI->getBody()->getFirstInsertionPt());
4906 Builder.SetCurrentDebugLocation(DL);
4907 return Builder.CreateAdd(OldIV, LowerBound);
4908 });
4909
4910 // In the "exit" block, call the "fini" function.
4911 Builder.SetInsertPoint(CLI->getExit(),
4912 CLI->getExit()->getTerminator()->getIterator());
4913 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
4914
4915 // Add the barrier if requested.
4916 if (NeedsBarrier) {
4917 InsertPointOrErrorTy BarrierIP =
4918 createBarrier(LocationDescription(Builder.saveIP(), DL),
4919 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4920 /* CheckCancelFlag */ false);
4921 if (!BarrierIP)
4922 return BarrierIP.takeError();
4923 }
4924
4925 InsertPointTy AfterIP = CLI->getAfterIP();
4926 CLI->invalidate();
4927
4928 return AfterIP;
4929}
4930
4931static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
4932 LoopInfo &LI);
4933static void addLoopMetadata(CanonicalLoopInfo *Loop,
4934 ArrayRef<Metadata *> Properties);
4935
4936static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
4937 LLVMContext &Ctx, Loop *Loop,
4938 LoopInfo &LoopInfo,
4939 SmallVector<Metadata *> &LoopMDList) {
4940 SmallSet<BasicBlock *, 8> Reachable;
4941
4942 // Get the basic blocks from the loop in which memref instructions
4943 // can be found.
4944 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
4945 // preferably without running any passes.
4946 for (BasicBlock *Block : Loop->getBlocks()) {
4947 if (Block == CLI->getCond() || Block == CLI->getHeader())
4948 continue;
4949 Reachable.insert(Block);
4950 }
4951
4952 // Add access group metadata to memory-access instructions.
4953 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
4954 for (BasicBlock *BB : Reachable)
4955 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
4956 // TODO: If the loop has existing parallel access metadata, have
4957 // to combine two lists.
4958 LoopMDList.push_back(MDNode::get(
4959 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
4960}
4961
4962OpenMPIRBuilder::InsertPointOrErrorTy
4963OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4964 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4965 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
4966 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
4967 assert(CLI->isValid() && "Requires a valid canonical loop");
4968 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
4969
4970 LLVMContext &Ctx = CLI->getFunction()->getContext();
4971 Value *IV = CLI->getIndVar();
4972 Value *OrigTripCount = CLI->getTripCount();
4973 Type *IVTy = IV->getType();
4974 assert(IVTy->getIntegerBitWidth() <= 64 &&
4975 "Max supported tripcount bitwidth is 64 bits");
4976 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4977 : Type::getInt64Ty(Ctx);
4978 Type *I32Type = Type::getInt32Ty(M.getContext());
4979 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4980 Constant *One = ConstantInt::get(InternalIVTy, 1);
4981
4982 Function *F = CLI->getFunction();
4983 FunctionAnalysisManager FAM;
4984 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
4985 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
4986 LoopAnalysis LIA;
4987 LoopInfo &&LI = LIA.run(*F, FAM);
4988 Loop *L = LI.getLoopFor(CLI->getHeader());
4989 SmallVector<Metadata *> LoopMDList;
4990 if (ChunkSize || DistScheduleChunkSize)
4991 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
4992 addLoopMetadata(CLI, LoopMDList);
4993
4994 // Declare useful OpenMP runtime functions.
4995 FunctionCallee StaticInit =
4996 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4997 FunctionCallee StaticFini =
4998 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4999
5000 // Allocate space for computed loop bounds as expected by the "init" function.
5001 Builder.restoreIP(AllocaIP);
5002 Builder.SetCurrentDebugLocation(DL);
5003 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5004 Value *PLowerBound =
5005 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5006 Value *PUpperBound =
5007 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5008 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5009 CLI->setLastIter(PLastIter);
5010
5011 // Set up the source location value for the OpenMP runtime.
5012 Builder.restoreIP(CLI->getPreheaderIP());
5013 Builder.SetCurrentDebugLocation(DL);
5014
5015 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5016 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5017 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5018 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5019 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5020 "distschedulechunksize");
5021 Value *CastedTripCount =
5022 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5023
5024 Constant *SchedulingType =
5025 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5026 Constant *DistSchedulingType =
5027 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5028 Builder.CreateStore(Zero, PLowerBound);
5029 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5030 Builder.CreateStore(OrigUpperBound, PUpperBound);
5031 Builder.CreateStore(One, PStride);
5032
5033 // Call the "init" function and update the trip count of the loop with the
5034 // value it produced.
5035 uint32_t SrcLocStrSize;
5036 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5037 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5038 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5039 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5040 PUpperBound, PStride, One,
5041 this](Value *SchedulingType, Value *ChunkSize,
5042 auto &Builder) {
5043 createRuntimeFunctionCall(
5044 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5045 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5046 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5047 /*pstride=*/PStride, /*incr=*/One,
5048 /*chunk=*/ChunkSize});
5049 };
5050 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5051 if (DistScheduleSchedType != OMPScheduleType::None &&
5052 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5053 SchedType != OMPScheduleType::OrderedDistribute) {
5054 // We want to emit a second init function call for the dist_schedule clause
5055 // of the Distribute construct. However, this should only be done if a
5056 // workshare loop is nested within a distribute construct.
5057 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5058 }
5059
5060 // Load values written by the "init" function.
5061 Value *FirstChunkStart =
5062 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5063 Value *FirstChunkStop =
5064 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5065 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5066 Value *ChunkRange =
5067 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5068 Value *NextChunkStride =
5069 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5070
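 // The chunked schedule becomes a two-level nest, roughly (a sketch):
 //   for (dispatch = firstchunk_lb; dispatch < tripcount; dispatch += stride)
 //     for (iv = 0; iv < min(chunk_range, tripcount - dispatch); ++iv)
 //       body(dispatch + iv);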
5071 // Create outer "dispatch" loop for enumerating the chunks.
5072 BasicBlock *DispatchEnter = splitBB(Builder, true);
5073 Value *DispatchCounter;
5074
5075 // It is safe to assume this didn't return an error because the callback
5076 // passed into createCanonicalLoop is the only possible error source, and it
5077 // always returns success.
5078 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5079 {Builder.saveIP(), DL},
5080 [&](InsertPointTy BodyIP, Value *Counter) {
5081 DispatchCounter = Counter;
5082 return Error::success();
5083 },
5084 FirstChunkStart, CastedTripCount, NextChunkStride,
5085 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5086 "dispatch"));
5087
5088 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5089 // not have to preserve the canonical invariant.
5090 BasicBlock *DispatchBody = DispatchCLI->getBody();
5091 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5092 BasicBlock *DispatchExit = DispatchCLI->getExit();
5093 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5094 DispatchCLI->invalidate();
5095
5096 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5097 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5098 redirectTo(CLI->getExit(), DispatchLatch, DL);
5099 redirectTo(DispatchBody, DispatchEnter, DL);
5100
5101 // Prepare the prolog of the chunk loop.
5102 Builder.restoreIP(CLI->getPreheaderIP());
5103 Builder.SetCurrentDebugLocation(DL);
5104
5105 // Compute the number of iterations of the chunk loop.
5106 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5107 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5108 Value *IsLastChunk =
5109 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5110 Value *CountUntilOrigTripCount =
5111 Builder.CreateSub(CastedTripCount, DispatchCounter);
5112 Value *ChunkTripCount = Builder.CreateSelect(
5113 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5114 Value *BackcastedChunkTC =
5115 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5116 CLI->setTripCount(BackcastedChunkTC);
5117
5118 // Update all uses of the induction variable except the one in the condition
5119 // block that compares it with the actual upper bound, and the increment in
5120 // the latch block.
5121 Value *BackcastedDispatchCounter =
5122 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5123 CLI->mapIndVar([&](Instruction *) -> Value * {
5124 Builder.restoreIP(CLI->getBodyIP());
5125 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5126 });
5127
5128 // In the "exit" block, call the "fini" function.
5129 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5130 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5131
5132 // Add the barrier if requested.
5133 if (NeedsBarrier) {
5134 InsertPointOrErrorTy AfterIP =
5135 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5136 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5137 if (!AfterIP)
5138 return AfterIP.takeError();
5139 }
5140
5141#ifndef NDEBUG
5142 // Even though we currently do not support applying additional methods to it,
5143 // the chunk loop should remain a canonical loop.
5144 CLI->assertOK();
5145#endif
5146
5147 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5148}
5149
5150// Returns an LLVM function to call for executing an OpenMP static worksharing
5151// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5152// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5153static FunctionCallee
5154getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
5155 WorksharingLoopType LoopType) {
5156 unsigned Bitwidth = Ty->getIntegerBitWidth();
5157 Module &M = OMPBuilder->M;
5158 switch (LoopType) {
5159 case WorksharingLoopType::ForStaticLoop:
5160 if (Bitwidth == 32)
5161 return OMPBuilder->getOrCreateRuntimeFunction(
5162 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5163 if (Bitwidth == 64)
5164 return OMPBuilder->getOrCreateRuntimeFunction(
5165 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5166 break;
5167 case WorksharingLoopType::DistributeStaticLoop:
5168 if (Bitwidth == 32)
5169 return OMPBuilder->getOrCreateRuntimeFunction(
5170 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5171 if (Bitwidth == 64)
5172 return OMPBuilder->getOrCreateRuntimeFunction(
5173 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5174 break;
5175 case WorksharingLoopType::DistributeForStaticLoop:
5176 if (Bitwidth == 32)
5177 return OMPBuilder->getOrCreateRuntimeFunction(
5178 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5179 if (Bitwidth == 64)
5180 return OMPBuilder->getOrCreateRuntimeFunction(
5181 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5182 break;
5183 }
5184 if (Bitwidth != 32 && Bitwidth != 64) {
5185 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5186 }
5187 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5188}
5189
5190// Inserts a call to the proper OpenMP Device RTL function which handles
5191// loop worksharing.
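// For a plain worksharing loop this amounts to a single call, roughly
// (mirroring the RealArgs vector assembled below; parameter names are
// illustrative, not the runtime's own):
//   __kmpc_for_static_loop_4u(ident, body_fn, body_arg, tripcount,
//                             num_threads, /*thread_chunk=*/0, /*mode=*/0);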
5192static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
5193 WorksharingLoopType LoopType,
5194 BasicBlock *InsertBlock, Value *Ident,
5195 Value *LoopBodyArg, Value *TripCount,
5196 Function &LoopBodyFn, bool NoLoop) {
5197 Type *TripCountTy = TripCount->getType();
5198 Module &M = OMPBuilder->M;
5199 IRBuilder<> &Builder = OMPBuilder->Builder;
5200 FunctionCallee RTLFn =
5201 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5202 SmallVector<Value *, 8> RealArgs;
5203 RealArgs.push_back(Ident);
5204 RealArgs.push_back(&LoopBodyFn);
5205 RealArgs.push_back(LoopBodyArg);
5206 RealArgs.push_back(TripCount);
5207 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5208 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5209 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5210 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5211 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5212 return;
5213 }
5214 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5215 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5216 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5217 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5218
5219 RealArgs.push_back(
5220 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5221 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5222 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5223 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5224 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5225 } else {
5226 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5227 }
5228
5229 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5230}
5231
5232static void workshareLoopTargetCallback(
5233 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5234 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5235 WorksharingLoopType LoopType, bool NoLoop) {
5236 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5237 BasicBlock *Preheader = CLI->getPreheader();
5238 Value *TripCount = CLI->getTripCount();
5239
5240 // After loop body outlining, the loop body contains only the setup of the
5241 // loop body argument structure and the call to the outlined loop body
5242 // function. First, we need to move the setup of the loop body args into
5243 // the loop preheader.
5244 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5245 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5246
5247 // The next step is to remove the whole loop: we do not need it anymore.
5248 // That's why we make an unconditional branch from the loop preheader to
5249 // the loop exit block.
5250 Builder.restoreIP({Preheader, Preheader->end()});
5251 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5252 Preheader->getTerminator()->eraseFromParent();
5253 Builder.CreateBr(CLI->getExit());
5254
5255 // Delete dead loop blocks
5256 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5257 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5258 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5259 CleanUpInfo.EntryBB = CLI->getHeader();
5260 CleanUpInfo.ExitBB = CLI->getExit();
5261 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5262 DeleteDeadBlocks(BlocksToBeRemoved);
5263
5264 // Find the instruction which corresponds to the loop body argument
5265 // structure and remove the call to the loop body function.
5266 Value *LoopBodyArg;
5267 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5268 assert(OutlinedFnUser &&
5269 "Expected unique undroppable user of outlined function");
5270 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5271 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5272 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5273 "Expected outlined function call to be located in loop preheader");
5274 // Check in case no argument structure has been passed.
5275 if (OutlinedFnCallInstruction->arg_size() > 1)
5276 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5277 else
5278 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5279 OutlinedFnCallInstruction->eraseFromParent();
5280
5281 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5282 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5283
5284 for (auto &ToBeDeletedItem : ToBeDeleted)
5285 ToBeDeletedItem->eraseFromParent();
5286 CLI->invalidate();
5287}
5288
5289OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5290 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5291 WorksharingLoopType LoopType, bool NoLoop) {
5292 uint32_t SrcLocStrSize;
5293 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5294 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5295
5296 OutlineInfo OI;
5297 OI.OuterAllocaBB = CLI->getPreheader();
5298 Function *OuterFn = CLI->getPreheader()->getParent();
5299
5300 // Instructions which need to be deleted at the end of code generation
5301 SmallVector<Instruction *, 4> ToBeDeleted;
5302
5303 OI.OuterAllocaBB = AllocaIP.getBlock();
5304
5305 // Mark the loop body as the region which needs to be extracted.
5306 OI.EntryBB = CLI->getBody();
5307 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5308 "omp.prelatch", true);
5309
5310 // Prepare loop body for extraction
5311 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5312
5313 // Insert new loop counter variable which will be used only in loop
5314 // body.
5315 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5316 Instruction *NewLoopCntLoad =
5317 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5318 // The new loop counter instructions are redundant in the loop preheader
5319 // once code generation for the workshare loop is finished. That's why we
5320 // mark them as ready for deletion.
5321 ToBeDeleted.push_back(NewLoopCntLoad);
5322 ToBeDeleted.push_back(NewLoopCnt);
5323
5324 // Analyse loop body region. Find all input variables which are used inside
5325 // loop body region.
5326 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5327 SmallVector<BasicBlock *, 32> Blocks;
5328 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5329
5330 CodeExtractorAnalysisCache CEAC(*OuterFn);
5331 CodeExtractor Extractor(Blocks,
5332 /* DominatorTree */ nullptr,
5333 /* AggregateArgs */ true,
5334 /* BlockFrequencyInfo */ nullptr,
5335 /* BranchProbabilityInfo */ nullptr,
5336 /* AssumptionCache */ nullptr,
5337 /* AllowVarArgs */ true,
5338 /* AllowAlloca */ true,
5339 /* AllocationBlock */ CLI->getPreheader(),
5340 /* Suffix */ ".omp_wsloop",
5341 /* AggrArgsIn0AddrSpace */ true);
5342
5343 BasicBlock *CommonExit = nullptr;
5344 SetVector<Value *> SinkingCands, HoistingCands;
5345
5346 // Find allocas outside the loop body region which are used inside loop
5347 // body
5348 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5349
5350 // We need to model the loop body region as the function f(cnt, loop_arg).
5351 // That's why we replace the loop induction variable with the new counter
5352 // which will become one of the loop body function arguments.
5353 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5354 CLI->getIndVar()->user_end());
5355 for (auto Use : Users) {
5356 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5357 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5358 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5359 }
5360 }
5361 }
5362 // Make sure the loop counter variable is not merged into the loop body
5363 // function argument structure and that it is passed as a separate variable.
5364 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5365
5366 // The PostOutline CB is invoked when the loop body function is outlined
5367 // and the loop body is replaced by a call to the outlined function. We
5368 // need to add a call to the OpenMP device RTL in the loop preheader. The
5369 // OpenMP device RTL function will handle the loop control logic.
5370 //
5371 OI.PostOutlineCB = [=, ToBeDeletedVec =
5372 std::move(ToBeDeleted)](Function &OutlinedFn) {
5373 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5374 LoopType, NoLoop);
5375 };
5376 addOutlineInfo(std::move(OI));
5377 return CLI->getAfterIP();
5378}
5379
5380OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5381 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5382 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5383 bool HasSimdModifier, bool HasMonotonicModifier,
5384 bool HasNonmonotonicModifier, bool HasOrderedClause,
5385 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5386 Value *DistScheduleChunkSize) {
5387 if (Config.isTargetDevice())
5388 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5389 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5390 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5391 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
5392
5393 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5394 OMPScheduleType::ModifierOrdered;
5395 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
5396 if (HasDistSchedule) {
5397 DistScheduleSchedType = DistScheduleChunkSize
5398 ? OMPScheduleType::OrderedDistributeChunked
5399 : OMPScheduleType::OrderedDistribute;
5400 }
5401 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5402 case OMPScheduleType::BaseStatic:
5403 case OMPScheduleType::BaseDistribute:
5404 assert((!ChunkSize || !DistScheduleChunkSize) &&
5405 "No chunk size with static-chunked schedule");
5406 if (IsOrdered && !HasDistSchedule)
5407 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5408 NeedsBarrier, ChunkSize);
5409 // FIXME: Monotonicity ignored?
5410 if (DistScheduleChunkSize)
5411 return applyStaticChunkedWorkshareLoop(
5412 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5413 DistScheduleChunkSize, DistScheduleSchedType);
5414 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
5415 HasDistSchedule);
5416
5417 case OMPScheduleType::BaseStaticChunked:
5418 case OMPScheduleType::BaseDistributeChunked:
5419 if (IsOrdered && !HasDistSchedule)
5420 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5421 NeedsBarrier, ChunkSize);
5422 // FIXME: Monotonicity ignored?
5423 return applyStaticChunkedWorkshareLoop(
5424 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5425 DistScheduleChunkSize, DistScheduleSchedType);
5426
5427 case OMPScheduleType::BaseRuntime:
5428 case OMPScheduleType::BaseAuto:
5429 case OMPScheduleType::BaseGreedy:
5430 case OMPScheduleType::BaseBalanced:
5431 case OMPScheduleType::BaseSteal:
5432 case OMPScheduleType::BaseGuidedSimd:
5433 case OMPScheduleType::BaseRuntimeSimd:
5434 assert(!ChunkSize &&
5435 "schedule type does not support user-defined chunk sizes");
5436 [[fallthrough]];
5437 case OMPScheduleType::BaseDynamicChunked:
5438 case OMPScheduleType::BaseGuidedChunked:
5439 case OMPScheduleType::BaseGuidedIterativeChunked:
5440 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5441 case OMPScheduleType::BaseStaticBalancedChunked:
5442 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5443 NeedsBarrier, ChunkSize);
5444
5445 default:
5446 llvm_unreachable("Unknown/unimplemented schedule kind");
5447 }
5448}
5449
5450/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5451/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5452/// the runtime. Always interpret integers as unsigned similarly to
5453/// CanonicalLoopInfo.
5454static FunctionCallee
5455getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5456 unsigned Bitwidth = Ty->getIntegerBitWidth();
5457 if (Bitwidth == 32)
5458 return OMPBuilder.getOrCreateRuntimeFunction(
5459 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5460 if (Bitwidth == 64)
5461 return OMPBuilder.getOrCreateRuntimeFunction(
5462 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5463 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5464}
5465
5466/// Returns an LLVM function to call for updating the next loop using OpenMP
5467/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5468/// the runtime. Always interpret integers as unsigned similarly to
5469/// CanonicalLoopInfo.
5470static FunctionCallee
5471getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5472 unsigned Bitwidth = Ty->getIntegerBitWidth();
5473 if (Bitwidth == 32)
5474 return OMPBuilder.getOrCreateRuntimeFunction(
5475 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5476 if (Bitwidth == 64)
5477 return OMPBuilder.getOrCreateRuntimeFunction(
5478 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5479 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5480}
5481
5482/// Returns an LLVM function to call for finalizing the dynamic loop using
5483/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5484/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5485static FunctionCallee
5486getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5487 unsigned Bitwidth = Ty->getIntegerBitWidth();
5488 if (Bitwidth == 32)
5489 return OMPBuilder.getOrCreateRuntimeFunction(
5490 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5491 if (Bitwidth == 64)
5492 return OMPBuilder.getOrCreateRuntimeFunction(
5493 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5494 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5495}
5496
5497OpenMPIRBuilder::InsertPointOrErrorTy
5498OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5499 InsertPointTy AllocaIP,
5500 OMPScheduleType SchedType,
5501 bool NeedsBarrier, Value *Chunk) {
5502 assert(CLI->isValid() && "Requires a valid canonical loop");
5503 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5504 "Require dedicated allocate IP");
5505 assert(isValidWorkshareLoopScheduleType(SchedType) &&
5506 "Require valid schedule type");
5507
5508 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5509 OMPScheduleType::ModifierOrdered;
5510
5511 // Set up the source location value for OpenMP runtime.
5512 Builder.SetCurrentDebugLocation(DL);
5513
5514 uint32_t SrcLocStrSize;
5515 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5516 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5517
5518 // Declare useful OpenMP runtime functions.
5519 Value *IV = CLI->getIndVar();
5520 Type *IVTy = IV->getType();
5521 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5522 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5523
5524 // Allocate space for computed loop bounds as expected by the "init" function.
5525 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5526 Type *I32Type = Type::getInt32Ty(M.getContext());
5527 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5528 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5529 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5530 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5531 CLI->setLastIter(PLastIter);
5532
5533 // At the end of the preheader, prepare for calling the "init" function by
5534 // storing the current loop bounds into the allocated space. A canonical loop
5535 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5536 // and produces an inclusive upper bound.
5537 BasicBlock *PreHeader = CLI->getPreheader();
5538 Builder.SetInsertPoint(PreHeader->getTerminator());
5539 Constant *One = ConstantInt::get(IVTy, 1);
5540 Builder.CreateStore(One, PLowerBound);
5541 Value *UpperBound = CLI->getTripCount();
5542 Builder.CreateStore(UpperBound, PUpperBound);
5543 Builder.CreateStore(One, PStride);
5544
5545 BasicBlock *Header = CLI->getHeader();
5546 BasicBlock *Exit = CLI->getExit();
5547 BasicBlock *Cond = CLI->getCond();
5548 BasicBlock *Latch = CLI->getLatch();
5549 InsertPointTy AfterIP = CLI->getAfterIP();
5550
5551 // The CLI will be "broken" in the code below, as the loop is no longer
5552 // a valid canonical loop.
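 // The rewritten control flow is, roughly (a sketch; the runtime hands out
 // 1-based inclusive chunk bounds):
 //   __kmpc_dispatch_init(loc, tid, sched, /*lb=*/1, tripcount, /*st=*/1,
 //                        chunk);
 //   while (__kmpc_dispatch_next(loc, tid, &last, &lb, &ub, &st))
 //     for (iv = lb - 1; iv < ub; ++iv)
 //       body(iv);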
5553
5554 if (!Chunk)
5555 Chunk = One;
5556
5557 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5558
5559 Constant *SchedulingType =
5560 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5561
5562 // Call the "init" function.
5563 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
5564 /* LowerBound */ One, UpperBound,
5565 /* step */ One, Chunk});
5566
5567 // An outer loop around the existing one.
5568 BasicBlock *OuterCond = BasicBlock::Create(
5569 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5570 PreHeader->getParent());
5571 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5572 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5573 Value *Res = createRuntimeFunctionCall(
5574 DynamicNext,
5575 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
5576 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5577 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5578 Value *LowerBound =
5579 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5580 Builder.CreateCondBr(MoreWork, Header, Exit);
5581
5582 // Change PHI-node in loop header to use outer cond rather than preheader,
5583 // and set IV to the LowerBound.
5584 Instruction *Phi = &Header->front();
5585 auto *PI = cast<PHINode>(Phi);
5586 PI->setIncomingBlock(0, OuterCond);
5587 PI->setIncomingValue(0, LowerBound);
5588
5589 // Then set the pre-header to jump to the OuterCond
5590 Instruction *Term = PreHeader->getTerminator();
5591 auto *Br = cast<BranchInst>(Term);
5592 Br->setSuccessor(0, OuterCond);
5593
5594 // Modify the inner condition:
5595 // * Use the UpperBound returned from the DynamicNext call.
5596 // * jump to the loop outer loop when done with one of the inner loops.
5597 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5598 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5599 Instruction *Comp = &*Builder.GetInsertPoint();
5600 auto *CI = cast<CmpInst>(Comp);
5601 CI->setOperand(1, UpperBound);
5602 // Redirect the inner exit to branch to outer condition.
5603 Instruction *Branch = &Cond->back();
5604 auto *BI = cast<BranchInst>(Branch);
5605 assert(BI->getSuccessor(1) == Exit);
5606 BI->setSuccessor(1, OuterCond);
5607
5608 // Call the "fini" function if "ordered" is present in the wsloop directive.
5609 if (Ordered) {
5610 Builder.SetInsertPoint(&Latch->back());
5611 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5612 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
5613 }
5614
5615 // Add the barrier if requested.
5616 if (NeedsBarrier) {
5617 Builder.SetInsertPoint(&Exit->back());
5618 InsertPointOrErrorTy BarrierIP =
5619 createBarrier(LocationDescription(Builder.saveIP(), DL),
5620 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5621 /* CheckCancelFlag */ false);
5622 if (!BarrierIP)
5623 return BarrierIP.takeError();
5624 }
5625
5626 CLI->invalidate();
5627 return AfterIP;
5628}
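// The net effect of the rewrite above, as pseudo-C over the dynamic dispatch
// runtime API (a sketch only; "lb"/"ub"/"last"/"st" name the stack slots
// allocated earlier, and the concrete entry points are the type-dependent
// __kmpc_dispatch_init_* / __kmpc_dispatch_next_* functions):
//
//   dispatch_init(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount, /*st=*/1,
//                 chunk);
//   while (dispatch_next(loc, tid, &last, &lb, &ub, &st)) {
//     for (iv = lb - 1; iv < ub; ++iv) // IV rebased to be zero-based
//       body(iv);
//   }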
5629
5630/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5631/// after this \p OldTarget will be orphaned.
5632 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5633 BasicBlock *NewTarget, DebugLoc DL) {
5634 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5635 redirectTo(Pred, NewTarget, DL);
5636}
5637
5638/// Determine which blocks in \p BBs are reachable from outside and remove the
5639/// ones that are not reachable from the function.
5640 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5641 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
5642 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5643 for (Use &U : BB->uses()) {
5644 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5645 if (!UseInst)
5646 continue;
5647 if (BBsToErase.count(UseInst->getParent()))
5648 continue;
5649 return true;
5650 }
5651 return false;
5652 };
5653
5654 while (BBsToErase.remove_if(HasRemainingUses)) {
5655 // Try again if anything was removed.
5656 }
5657
5658 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5659 DeleteDeadBlocks(BBVec);
5660}
5661
5662CanonicalLoopInfo *
5663OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5664 InsertPointTy ComputeIP) {
5665 assert(Loops.size() >= 1 && "At least one loop required");
5666 size_t NumLoops = Loops.size();
5667
5668 // Nothing to do if there is already just one loop.
5669 if (NumLoops == 1)
5670 return Loops.front();
5671
5672 CanonicalLoopInfo *Outermost = Loops.front();
5673 CanonicalLoopInfo *Innermost = Loops.back();
5674 BasicBlock *OrigPreheader = Outermost->getPreheader();
5675 BasicBlock *OrigAfter = Outermost->getAfter();
5676 Function *F = OrigPreheader->getParent();
5677
5678 // Loop control blocks that may become orphaned later.
5679 SmallVector<BasicBlock *, 12> OldControlBBs;
5680 OldControlBBs.reserve(6 * Loops.size());
5681 for (CanonicalLoopInfo *Loop : Loops)
5682 Loop->collectControlBlocks(OldControlBBs);
5683
5684 // Setup the IRBuilder for inserting the trip count computation.
5685 Builder.SetCurrentDebugLocation(DL);
5686 if (ComputeIP.isSet())
5687 Builder.restoreIP(ComputeIP);
5688 else
5689 Builder.restoreIP(Outermost->getPreheaderIP());
5690
5691 // Derive the collapsed loop's trip count.
5692 // TODO: Find common/largest indvar type.
5693 Value *CollapsedTripCount = nullptr;
5694 for (CanonicalLoopInfo *L : Loops) {
5695 assert(L->isValid() &&
5696 "All loops to collapse must be valid canonical loops");
5697 Value *OrigTripCount = L->getTripCount();
5698 if (!CollapsedTripCount) {
5699 CollapsedTripCount = OrigTripCount;
5700 continue;
5701 }
5702
5703 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5704 CollapsedTripCount =
5705 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5706 }
5707
5708 // Create the collapsed loop control flow.
5709 CanonicalLoopInfo *Result =
5710 createLoopSkeleton(DL, CollapsedTripCount, F,
5711 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5712
5713 // Build the collapsed loop body code.
5714 // Start with deriving the input loop induction variables from the collapsed
5715 // one, using a divmod scheme. To preserve the original loops' order, the
5716 // innermost loop uses the least significant bits.
5717 Builder.restoreIP(Result->getBodyIP());
5718
5719 Value *Leftover = Result->getIndVar();
5720 SmallVector<Value *> NewIndVars;
5721 NewIndVars.resize(NumLoops);
5722 for (int i = NumLoops - 1; i >= 1; --i) {
5723 Value *OrigTripCount = Loops[i]->getTripCount();
5724
5725 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5726 NewIndVars[i] = NewIndVar;
5727
5728 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5729 }
5730 // Outermost loop gets all the remaining bits.
5731 NewIndVars[0] = Leftover;
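// For instance (illustrative names only), collapsing a triple nest with trip
// counts TC0, TC1, TC2 derives the original induction variables as
//   iv2 = iv % TC2;
//   iv1 = (iv / TC2) % TC1;
//   iv0 = (iv / TC2) / TC1;
// so the innermost loop's index lives in the least significant "digits" of
// the collapsed induction variable iv.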
5732
5733 // Construct the loop body control flow.
5734 // We progressively construct the branch structure following the direction of
5735 // control flow: first the leading in-between code, then the loop nest body,
5736 // then the trailing in-between code, finally rejoining the collapsed loop's
5737 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
5738 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5739 // its predecessors as sources.
5740 BasicBlock *ContinueBlock = Result->getBody();
5741 BasicBlock *ContinuePred = nullptr;
5742 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5743 BasicBlock *NextSrc) {
5744 if (ContinueBlock)
5745 redirectTo(ContinueBlock, Dest, DL);
5746 else
5747 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5748
5749 ContinueBlock = nullptr;
5750 ContinuePred = NextSrc;
5751 };
5752
5753 // The code before the nested loop of each level.
5754 // Because we are sinking it into the nest, it will be executed more often
5755 // than the original loop. More sophisticated schemes could keep track of what
5756 // the in-between code is and instantiate it only once per thread.
5757 for (size_t i = 0; i < NumLoops - 1; ++i)
5758 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5759
5760 // Connect the loop nest body.
5761 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5762
5763 // The code after the nested loop at each level.
5764 for (size_t i = NumLoops - 1; i > 0; --i)
5765 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5766
5767 // Connect the finished loop to the collapsed loop latch.
5768 ContinueWith(Result->getLatch(), nullptr);
5769
5770 // Replace the input loops with the new collapsed loop.
5771 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5772 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5773
5774 // Replace the input loop indvars with the derived ones.
5775 for (size_t i = 0; i < NumLoops; ++i)
5776 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5777
5778 // Remove unused parts of the input loops.
5779 removeUnusedBlocksFromParent(OldControlBBs);
5780
5781 for (CanonicalLoopInfo *L : Loops)
5782 L->invalidate();
5783
5784#ifndef NDEBUG
5785 Result->assertOK();
5786#endif
5787 return Result;
5788}
5789
5790std::vector<CanonicalLoopInfo *>
5791OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5792 ArrayRef<Value *> TileSizes) {
5793 assert(TileSizes.size() == Loops.size() &&
5794 "Must pass as many tile sizes as there are loops");
5795 int NumLoops = Loops.size();
5796 assert(NumLoops >= 1 && "At least one loop to tile required");
5797
5798 CanonicalLoopInfo *OutermostLoop = Loops.front();
5799 CanonicalLoopInfo *InnermostLoop = Loops.back();
5800 Function *F = OutermostLoop->getBody()->getParent();
5801 BasicBlock *InnerEnter = InnermostLoop->getBody();
5802 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5803
5804 // Loop control blocks that may become orphaned later.
5805 SmallVector<BasicBlock *, 12> OldControlBBs;
5806 OldControlBBs.reserve(6 * Loops.size());
5807 for (CanonicalLoopInfo *Loop : Loops)
5808 Loop->collectControlBlocks(OldControlBBs);
5809
5810 // Collect original trip counts and induction variables to be accessible by
5811 // index. Also, the structure of the original loops is not preserved during
5812 // the construction of the tiled loops, so do it before we scavenge the BBs of
5813 // any original CanonicalLoopInfo.
5814 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5815 for (CanonicalLoopInfo *L : Loops) {
5816 assert(L->isValid() && "All input loops must be valid canonical loops");
5817 OrigTripCounts.push_back(L->getTripCount());
5818 OrigIndVars.push_back(L->getIndVar());
5819 }
5820
5821 // Collect the code between loop headers. These may contain SSA definitions
5822 // that are used in the loop nest body. To be usable within the innermost
5823 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5824 // these instructions may be executed more often than before the tiling.
5825 // TODO: It would be sufficient to only sink them into the body of the
5826 // corresponding tile loop.
5827 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5828 for (int i = 0; i < NumLoops - 1; ++i) {
5829 CanonicalLoopInfo *Surrounding = Loops[i];
5830 CanonicalLoopInfo *Nested = Loops[i + 1];
5831
5832 BasicBlock *EnterBB = Surrounding->getBody();
5833 BasicBlock *ExitBB = Nested->getHeader();
5834 InbetweenCode.emplace_back(EnterBB, ExitBB);
5835 }
5836
5837 // Compute the trip counts of the floor loops.
5838 Builder.SetCurrentDebugLocation(DL);
5839 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5840 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5841 for (int i = 0; i < NumLoops; ++i) {
5842 Value *TileSize = TileSizes[i];
5843 Value *OrigTripCount = OrigTripCounts[i];
5844 Type *IVType = OrigTripCount->getType();
5845
5846 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5847 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5848
5849 // 0 if the tilesize divides the tripcount, 1 otherwise.
5850 // 1 means we need an additional iteration for a partial tile.
5851 //
5852 // Unfortunately we cannot just use the roundup-formula
5853 // (tripcount + tilesize - 1)/tilesize
5854 // because the summation might overflow. We do not want to introduce undefined
5855 // behavior when the untiled loop nest did not.
5856 Value *FloorTripOverflow =
5857 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5858
5859 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5860 Value *FloorTripCount =
5861 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5862 "omp_floor" + Twine(i) + ".tripcount", true);
5863
5864 // Remember some values for later use.
5865 FloorCompleteCount.push_back(FloorCompleteTripCount);
5866 FloorCount.push_back(FloorTripCount);
5867 FloorRems.push_back(FloorTripRem);
5868 }
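// A worked example of the overflow-safe round-up above (numbers are
// illustrative): for tripcount = 10 and tilesize = 4,
//   FloorCompleteTripCount = 10 / 4 = 2
//   FloorTripRem           = 10 % 4 = 2 (non-zero, so one partial tile)
//   FloorTripCount         = 2 + 1  = 3
// which equals ceil(10 / 4) without ever evaluating 10 + 4 - 1.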
5869
5870 // Generate the new loop nest, from the outermost to the innermost.
5871 std::vector<CanonicalLoopInfo *> Result;
5872 Result.reserve(NumLoops * 2);
5873
5874 // The basic block of the surrounding loop that enters the generated loop
5875 // nest.
5876 BasicBlock *Enter = OutermostLoop->getPreheader();
5877
5878 // The basic block of the surrounding loop where the inner code should
5879 // continue.
5880 BasicBlock *Continue = OutermostLoop->getAfter();
5881
5882 // Where the next loop basic block should be inserted.
5883 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5884
5885 auto EmbeddNewLoop =
5886 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5887 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5888 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5889 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5890 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5891 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5892
5893 // Setup the position where the next embedded loop connects to this loop.
5894 Enter = EmbeddedLoop->getBody();
5895 Continue = EmbeddedLoop->getLatch();
5896 OutroInsertBefore = EmbeddedLoop->getLatch();
5897 return EmbeddedLoop;
5898 };
5899
5900 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5901 const Twine &NameBase) {
5902 for (auto P : enumerate(TripCounts)) {
5903 CanonicalLoopInfo *EmbeddedLoop =
5904 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5905 Result.push_back(EmbeddedLoop);
5906 }
5907 };
5908
5909 EmbeddNewLoops(FloorCount, "floor");
5910
5911 // Within the innermost floor loop, emit the code that computes the tile
5912 // sizes.
5913 Builder.SetInsertPoint(Enter->getTerminator());
5914 SmallVector<Value *, 4> TileCounts;
5915 for (int i = 0; i < NumLoops; ++i) {
5916 CanonicalLoopInfo *FloorLoop = Result[i];
5917 Value *TileSize = TileSizes[i];
5918
5919 Value *FloorIsEpilogue =
5920 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5921 Value *TileTripCount =
5922 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5923
5924 TileCounts.push_back(TileTripCount);
5925 }
5926
5927 // Create the tile loops.
5928 EmbeddNewLoops(TileCounts, "tile");
5929
5930 // Insert the inbetween code into the body.
5931 BasicBlock *BodyEnter = Enter;
5932 BasicBlock *BodyEntered = nullptr;
5933 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5934 BasicBlock *EnterBB = P.first;
5935 BasicBlock *ExitBB = P.second;
5936
5937 if (BodyEnter)
5938 redirectTo(BodyEnter, EnterBB, DL);
5939 else
5940 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5941
5942 BodyEnter = nullptr;
5943 BodyEntered = ExitBB;
5944 }
5945
5946 // Append the original loop nest body into the generated loop nest body.
5947 if (BodyEnter)
5948 redirectTo(BodyEnter, InnerEnter, DL);
5949 else
5950 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5952
5953 // Replace the original induction variable with an induction variable computed
5954 // from the tile and floor induction variables.
5955 Builder.restoreIP(Result.back()->getBodyIP());
5956 for (int i = 0; i < NumLoops; ++i) {
5957 CanonicalLoopInfo *FloorLoop = Result[i];
5958 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5959 Value *OrigIndVar = OrigIndVars[i];
5960 Value *Size = TileSizes[i];
5961
5962 Value *Scale =
5963 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5964 Value *Shift =
5965 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5966 OrigIndVar->replaceAllUsesWith(Shift);
5967 }
5968
5969 // Remove unused parts of the original loops.
5970 removeUnusedBlocksFromParent(OldControlBBs);
5971
5972 for (CanonicalLoopInfo *L : Loops)
5973 L->invalidate();
5974
5975#ifndef NDEBUG
5976 for (CanonicalLoopInfo *GenL : Result)
5977 GenL->assertOK();
5978#endif
5979 return Result;
5980}
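// A minimal usage sketch (assuming OMPBuilder is an initialized
// OpenMPIRBuilder, DL a DebugLoc, and Loop a valid CanonicalLoopInfo; all
// three names are illustrative):
//
//   Value *Four = ConstantInt::get(Loop->getIndVarType(), 4);
//   std::vector<CanonicalLoopInfo *> Nest =
//       OMPBuilder.tileLoops(DL, {Loop}, {Four});
//   // Nest[0] is the "floor" loop over tiles, Nest[1] the "tile" loop
//   // within one tile; the original IV becomes 4 * floorIV + tileIV.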
5981
5982/// Attach metadata \p Properties to the basic block described by \p BB. If the
5983/// basic block already has metadata, the basic block properties are appended.
5984 static void addBasicBlockMetadata(BasicBlock *BB,
5985 ArrayRef<Metadata *> Properties) {
5986 // Nothing to do if no property to attach.
5987 if (Properties.empty())
5988 return;
5989
5990 LLVMContext &Ctx = BB->getContext();
5991 SmallVector<Metadata *> NewProperties;
5992 NewProperties.push_back(nullptr);
5993
5994 // If the basic block already has metadata, prepend it to the new metadata.
5995 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5996 if (Existing)
5997 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5998
5999 append_range(NewProperties, Properties);
6000 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6001 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6002
6003 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6004}
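// The distinct node created above has the canonical loop-ID shape, e.g.
// (a sketch of the resulting IR):
//
//   br i1 %cond, ..., !llvm.loop !0
//   !0 = distinct !{!0, !1, !2} ; operand 0 refers back to !0 itself
//   !1 = ...                    ; properties carried over from Existing
//   !2 = ...                    ; newly appended Properties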
6005
6006/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6007/// loop already has metadata, the loop properties are appended.
6008static void addLoopMetadata(CanonicalLoopInfo *Loop,
6009 ArrayRef<Metadata *> Properties) {
6010 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6011
6012 // Attach metadata to the loop's latch
6013 BasicBlock *Latch = Loop->getLatch();
6014 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6015 addBasicBlockMetadata(Latch, Properties);
6016}
6017
6018/// Attach llvm.access.group metadata to the memref instructions of \p Block
6019 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
6020 LoopInfo &LI) {
6021 for (Instruction &I : *Block) {
6022 if (I.mayReadOrWriteMemory()) {
6023 // TODO: This instruction may already have an access group from
6024 // other pragmas, e.g. #pragma clang loop vectorize. Append
6025 // so that the existing metadata is not overwritten.
6026 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6027 }
6028 }
6029}
6030
6031void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
6032 LLVMContext &Ctx = Builder.getContext();
6033 addLoopMetadata(
6034 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6035 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6036}
6037
6038void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
6039 LLVMContext &Ctx = Builder.getContext();
6040 addLoopMetadata(
6041 Loop, {
6042 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6043 });
6044}
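// After unrollLoopFull, for example, the latch terminator carries (sketch):
//   !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.full"}
// The actual unrolling is then performed later by the LoopUnrollPass.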
6045
6046void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6047 Value *IfCond, ValueToValueMapTy &VMap,
6048 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6049 const Twine &NamePrefix) {
6050 Function *F = CanonicalLoop->getFunction();
6051
6052 // We can't do
6053 // if (cond) {
6054 // simd_loop;
6055 // } else {
6056 // non_simd_loop;
6057 // }
6058 // because then the CanonicalLoopInfo would only point to one of the loops,
6059 // causing other constructs that operate on the same loop to malfunction.
6060 // Instead generate
6061 // while (...) {
6062 // if (cond) {
6063 // simd_body;
6064 // } else {
6065 // not_simd_body;
6066 // }
6067 // }
6068 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6069 // body at -O3
6070
6071 // Define where the if branch should be inserted.
6072 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6073
6074 // Create additional blocks for the if statement
6075 BasicBlock *Cond = SplitBeforeIt->getParent();
6076 llvm::LLVMContext &C = Cond->getContext();
6077 BasicBlock *ThenBlock = BasicBlock::Create(
6078 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6079 BasicBlock *ElseBlock = BasicBlock::Create(
6080 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6081
6082 // Create if condition branch.
6083 Builder.SetInsertPoint(SplitBeforeIt);
6084 Instruction *BrInstr =
6085 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6086 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6087 // Then block contains branch to omp loop body which needs to be vectorized
6088 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6089 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6090
6091 Builder.SetInsertPoint(ElseBlock);
6092
6093 // Clone loop for the else branch
6094 SmallVector<BasicBlock *, 8> NewBlocks;
6095
6096 SmallVector<BasicBlock *, 8> ExistingBlocks;
6097 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6098 ExistingBlocks.push_back(ThenBlock);
6099 ExistingBlocks.append(L->block_begin(), L->block_end());
6100 // Cond is the block that has the if clause condition
6101 // LoopCond is omp_loop.cond
6102 // LoopHeader is omp_loop.header
6103 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6104 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6105 assert(LoopCond && LoopHeader && "Invalid loop structure");
6106 for (BasicBlock *Block : ExistingBlocks) {
6107 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6108 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6109 continue;
6110 }
6111 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6112
6113 // Fix the name so it is not omp.if.then.
6114 if (Block == ThenBlock)
6115 NewBB->setName(NamePrefix + ".if.else");
6116
6117 NewBB->moveBefore(CanonicalLoop->getExit());
6118 VMap[Block] = NewBB;
6119 NewBlocks.push_back(NewBB);
6120 }
6121 remapInstructionsInBlocks(NewBlocks, VMap);
6122 Builder.CreateBr(NewBlocks.front());
6123
6124 // The loop latch must have only one predecessor. Currently it is branched to
6125 // from both the 'then' and 'else' branches.
6126 L->getLoopLatch()->splitBasicBlock(
6127 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
6128
6129 // Ensure that the then block is added to the loop so we add the attributes in
6130 // the next step
6131 L->addBasicBlockToLoop(ThenBlock, LI);
6132}
6133
6134unsigned
6135OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
6136 const StringMap<bool> &Features) {
6137 if (TargetTriple.isX86()) {
6138 if (Features.lookup("avx512f"))
6139 return 512;
6140 else if (Features.lookup("avx"))
6141 return 256;
6142 return 128;
6143 }
6144 if (TargetTriple.isPPC())
6145 return 128;
6146 if (TargetTriple.isWasm())
6147 return 128;
6148 return 0;
6149}
6150
6151void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
6152 MapVector<Value *, Value *> AlignedVars,
6153 Value *IfCond, OrderKind Order,
6154 ConstantInt *Simdlen, ConstantInt *Safelen) {
6155 LLVMContext &Ctx = Builder.getContext();
6156
6157 Function *F = CanonicalLoop->getFunction();
6158
6159 // TODO: We should not rely on pass manager. Currently we use pass manager
6160 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6161 // object. We should have a method which returns all blocks between
6162 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6163 FunctionAnalysisManager FAM;
6164 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6165 FAM.registerPass([]() { return LoopAnalysis(); });
6166 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6167
6168 LoopAnalysis LIA;
6169 LoopInfo &&LI = LIA.run(*F, FAM);
6170
6171 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6172 if (AlignedVars.size()) {
6173 InsertPointTy IP = Builder.saveIP();
6174 for (auto &AlignedItem : AlignedVars) {
6175 Value *AlignedPtr = AlignedItem.first;
6176 Value *Alignment = AlignedItem.second;
6177 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6178 Builder.SetInsertPoint(loadInst->getNextNode());
6179 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6180 Alignment);
6181 }
6182 Builder.restoreIP(IP);
6183 }
6184
6185 if (IfCond) {
6186 ValueToValueMapTy VMap;
6187 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6188 }
6189
6190 SmallPtrSet<BasicBlock *, 8> Reachable;
6191
6192 // Get the basic blocks from the loop in which memref instructions
6193 // can be found.
6194 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
6195 // preferably without running any passes.
6196 for (BasicBlock *Block : L->getBlocks()) {
6197 if (Block == CanonicalLoop->getCond() ||
6198 Block == CanonicalLoop->getHeader())
6199 continue;
6200 Reachable.insert(Block);
6201 }
6202
6203 SmallVector<Metadata *> LoopMDList;
6204
6205 // In presence of finite 'safelen', it may be unsafe to mark all
6206 // the memory instructions parallel, because loop-carried
6207 // dependences of 'safelen' iterations are possible.
6208 // If clause order(concurrent) is specified then the memory instructions
6209 // are marked parallel even if 'safelen' is finite.
6210 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6211 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6212
6213 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6214 // versions so we can't add the loop attributes in that case.
6215 if (IfCond) {
6216 // We can still add llvm.loop.parallel_accesses.
6217 addLoopMetadata(CanonicalLoop, LoopMDList);
6218 return;
6219 }
6220
6221 // Use the above access group metadata to create loop level
6222 // metadata, which should be distinct for each loop.
6223 ConstantAsMetadata *BoolConst =
6224 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6225 LoopMDList.push_back(MDNode::get(
6226 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6227
6228 if (Simdlen || Safelen) {
6229 // If both simdlen and safelen clauses are specified, the value of the
6230 // simdlen parameter must be less than or equal to the value of the safelen
6231 // parameter. Therefore, use safelen only in the absence of simdlen.
6232 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6233 LoopMDList.push_back(
6234 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6235 ConstantAsMetadata::get(VectorizeWidth)}));
6236 }
6237
6238 addLoopMetadata(CanonicalLoop, LoopMDList);
6239}
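// For a simd loop with simdlen(8), no safelen and no if clause, the metadata
// assembled above amounts to (sketch):
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !2 = !{!"llvm.loop.vectorize.width", i32 8}
// plus the llvm.loop.parallel_accesses entry added by
// applyParallelAccessesMetadata when it is safe to do so.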
6240
6241/// Create the TargetMachine object to query the backend for optimization
6242/// preferences.
6243///
6244/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6245/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6246 /// needed for the LLVM pass pipeline. We use some default options to avoid
6247/// having to pass too many settings from the frontend that probably do not
6248/// matter.
6249///
6250/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6251/// method. If we are going to use TargetMachine for more purposes, especially
6252/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6253 /// might become worth requiring front-ends to pass on their TargetMachine,
6254 /// or at least cache it between methods. Note that while frontends such as Clang
6255/// have just a single main TargetMachine per translation unit, "target-cpu" and
6256/// "target-features" that determine the TargetMachine are per-function and can
6257 /// be overridden using __attribute__((target("OPTIONS"))).
6258static std::unique_ptr<TargetMachine>
6259 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6260 Module *M = F->getParent();
6261
6262 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6263 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6264 const llvm::Triple &Triple = M->getTargetTriple();
6265
6266 std::string Error;
6267 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6268 if (!TheTarget)
6269 return {};
6270
6271 llvm::TargetOptions Options;
6272 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6273 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6274 /*CodeModel=*/std::nullopt, OptLevel));
6275}
6276
6277/// Heuristically determine the best-performant unroll factor for \p CLI. This
6278/// depends on the target processor. We are re-using the same heuristics as the
6279/// LoopUnrollPass.
6280static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6281 Function *F = CLI->getFunction();
6282
6283 // Assume the user requests the most aggressive unrolling, even if the rest of
6284 // the code is optimized using a lower setting.
6285 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6286 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6287
6288 FunctionAnalysisManager FAM;
6289 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6290 FAM.registerPass([]() { return AssumptionAnalysis(); });
6291 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6292 FAM.registerPass([]() { return LoopAnalysis(); });
6293 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6294 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6295 TargetIRAnalysis TIRA;
6296 if (TM)
6297 TIRA = TargetIRAnalysis(
6298 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6299 FAM.registerPass([&]() { return TIRA; });
6300
6301 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6302 ScalarEvolutionAnalysis SEA;
6303 ScalarEvolution &&SE = SEA.run(*F, FAM);
6304 DominatorTreeAnalysis DTA;
6305 DominatorTree &&DT = DTA.run(*F, FAM);
6306 LoopAnalysis LIA;
6307 LoopInfo &&LI = LIA.run(*F, FAM);
6308 AssumptionAnalysis ACT;
6309 AssumptionCache &&AC = ACT.run(*F, FAM);
6310 OptimizationRemarkEmitter ORE{F};
6311
6312 Loop *L = LI.getLoopFor(CLI->getHeader());
6313 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6314
6315 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6316 L, SE, TTI,
6317 /*BlockFrequencyInfo=*/nullptr,
6318 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6319 /*UserThreshold=*/std::nullopt,
6320 /*UserCount=*/std::nullopt,
6321 /*UserAllowPartial=*/true,
6322 /*UserAllowRuntime=*/true,
6323 /*UserUpperBound=*/std::nullopt,
6324 /*UserFullUnrollMaxCount=*/std::nullopt);
6325
6326 UP.Force = true;
6327
6328 // Account for additional optimizations taking place before the LoopUnrollPass
6329 // would unroll the loop.
6330 UP.Threshold *= UnrollThresholdFactor;
6331 UP.PartialThreshold *= UnrollThresholdFactor;
6332
6333 // Use normal unroll factors even if the rest of the code is optimized for
6334 // size.
6335 UP.OptSizeThreshold = UP.Threshold;
6336 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6337
6338 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6339 << " Threshold=" << UP.Threshold << "\n"
6340 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6341 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6342 << " PartialOptSizeThreshold="
6343 << UP.PartialOptSizeThreshold << "\n");
6344
6345 // Disable peeling.
6346 TargetTransformInfo::PeelingPreferences PP =
6347 gatherPeelingPreferences(L, SE, TTI,
6348 /*UserAllowPeeling=*/false,
6349 /*UserAllowProfileBasedPeeling=*/false,
6350 /*UnrollingSpecficValues=*/false);
6351
6352 SmallPtrSet<const Value *, 32> EphValues;
6353 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6354
6355 // Assume that reads and writes to stack variables can be eliminated by
6356 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6357 // size.
6358 for (BasicBlock *BB : L->blocks()) {
6359 for (Instruction &I : *BB) {
6360 Value *Ptr;
6361 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6362 Ptr = Load->getPointerOperand();
6363 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6364 Ptr = Store->getPointerOperand();
6365 } else
6366 continue;
6367
6368 Ptr = Ptr->stripPointerCasts();
6369
6370 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6371 if (Alloca->getParent() == &F->getEntryBlock())
6372 EphValues.insert(&I);
6373 }
6374 }
6375 }
6376
6377 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6378
6379 // The loop is not unrollable if it contains certain instructions.
6380 if (!UCE.canUnroll()) {
6381 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6382 return 1;
6383 }
6384
6385 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6386 << "\n");
6387
6388 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6389 // be able to use it.
6390 int TripCount = 0;
6391 int MaxTripCount = 0;
6392 bool MaxOrZero = false;
6393 unsigned TripMultiple = 0;
6394
6395 bool UseUpperBound = false;
6396 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6397 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6398 UseUpperBound);
6399 unsigned Factor = UP.Count;
6400 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6401
6402 // This function returns 1 to signal that the loop should not be unrolled.
6403 if (Factor == 0)
6404 return 1;
6405 return Factor;
6406}
6407
6408void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6409 int32_t Factor,
6410 CanonicalLoopInfo **UnrolledCLI) {
6411 assert(Factor >= 0 && "Unroll factor must not be negative");
6412
6413 Function *F = Loop->getFunction();
6414 LLVMContext &Ctx = F->getContext();
6415
6416 // If the unrolled loop is not used for another loop-associated directive, it
6417 // is sufficient to add metadata for the LoopUnrollPass.
6418 if (!UnrolledCLI) {
6419 SmallVector<Metadata *, 2> LoopMetadata;
6420 LoopMetadata.push_back(
6421 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6422
6423 if (Factor >= 1) {
6424 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6425 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6426 LoopMetadata.push_back(MDNode::get(
6427 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6428 }
6429
6430 addLoopMetadata(Loop, LoopMetadata);
6431 return;
6432 }
6433
6434 // Heuristically determine the unroll factor.
6435 if (Factor == 0)
6436 Factor = computeHeuristicUnrollFactor(Loop);
6437
6438 // No change required with unroll factor 1.
6439 if (Factor == 1) {
6440 *UnrolledCLI = Loop;
6441 return;
6442 }
6443
6444 assert(Factor >= 2 &&
6445 "unrolling only makes sense with a factor of 2 or larger");
6446
6447 Type *IndVarTy = Loop->getIndVarType();
6448
6449 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6450 // unroll the inner loop.
6451 Value *FactorVal =
6452 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6453 /*isSigned=*/false));
6454 std::vector<CanonicalLoopInfo *> LoopNest =
6455 tileLoops(DL, {Loop}, {FactorVal});
6456 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6457 *UnrolledCLI = LoopNest[0];
6458 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6459
6460 // LoopUnrollPass can only fully unroll loops with constant trip count.
6461 // Unroll by the unroll factor with a fallback epilog for the remainder
6462 // iterations if necessary.
6463 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6464 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6465 addLoopMetadata(
6466 InnerLoop,
6467 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6468 MDNode::get(
6469 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6470
6471#ifndef NDEBUG
6472 (*UnrolledCLI)->assertOK();
6473#endif
6474}
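// For example, unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled) with a
// non-null out-pointer tiles the loop by 4 and marks the inner tile loop
// with llvm.loop.unroll.count 4 for the LoopUnrollPass; "Unrolled" then
// names the floor loop, which remains a valid CanonicalLoopInfo for later
// loop-associated directives (a sketch; variable names are illustrative):
//
//   CanonicalLoopInfo *Unrolled = nullptr;
//   OMPBuilder.unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled);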
6475
6476OpenMPIRBuilder::InsertPointTy
6477OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6478 llvm::Value *BufSize, llvm::Value *CpyBuf,
6479 llvm::Value *CpyFn, llvm::Value *DidIt) {
6480 if (!updateToLocation(Loc))
6481 return Loc.IP;
6482
6483 uint32_t SrcLocStrSize;
6484 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6485 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6486 Value *ThreadId = getOrCreateThreadID(Ident);
6487
6488 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6489
6490 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6491
6492 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6493 createRuntimeFunctionCall(Fn, Args);
6494
6495 return Builder.saveIP();
6496}
6497
6498OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6499 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6500 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6501 ArrayRef<llvm::Function *> CPFuncs) {
6502
6503 if (!updateToLocation(Loc))
6504 return Loc.IP;
6505
6506 // If needed, allocate and initialize `DidIt` with 0.
6507 // DidIt: flag variable: 1=single thread; 0=not single thread.
6508 llvm::Value *DidIt = nullptr;
6509 if (!CPVars.empty()) {
6510 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6511 Builder.CreateStore(Builder.getInt32(0), DidIt);
6512 }
6513
6514 Directive OMPD = Directive::OMPD_single;
6515 uint32_t SrcLocStrSize;
6516 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6517 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6518 Value *ThreadId = getOrCreateThreadID(Ident);
6519 Value *Args[] = {Ident, ThreadId};
6520
6521 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6522 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6523
6524 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6525 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6526
6527 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6528 if (Error Err = FiniCB(IP))
6529 return Err;
6530
6531 // The thread that executes the single region must set `DidIt` to 1.
6532 // This is used by __kmpc_copyprivate, to know if the caller is the
6533 // single thread or not.
6534 if (DidIt)
6535 Builder.CreateStore(Builder.getInt32(1), DidIt);
6536
6537 return Error::success();
6538 };
6539
6540 // generates the following:
6541 // if (__kmpc_single()) {
6542 // .... single region ...
6543 // __kmpc_end_single
6544 // }
6545 // __kmpc_copyprivate
6546 // __kmpc_barrier
6547
6548 InsertPointOrErrorTy AfterIP =
6549 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6550 /*Conditional*/ true,
6551 /*hasFinalize*/ true);
6552 if (!AfterIP)
6553 return AfterIP.takeError();
6554
6555 if (DidIt) {
6556 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6557 // NOTE BufSize is currently unused, so just pass 0.
6558 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6559 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6560 CPFuncs[I], DidIt);
6561 // NOTE __kmpc_copyprivate already inserts a barrier
6562 } else if (!IsNowait) {
6563 InsertPointOrErrorTy AfterIP =
6564 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6565 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6566 /* CheckCancelFlag */ false);
6567 if (!AfterIP)
6568 return AfterIP.takeError();
6569 }
6570 return Builder.saveIP();
6571}
6572
6573OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6574 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6575 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6576
6577 if (!updateToLocation(Loc))
6578 return Loc.IP;
6579
6580 Directive OMPD = Directive::OMPD_critical;
6581 uint32_t SrcLocStrSize;
6582 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6583 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6584 Value *ThreadId = getOrCreateThreadID(Ident);
6585 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6586 Value *Args[] = {Ident, ThreadId, LockVar};
6587
6588 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6589 Function *RTFn = nullptr;
6590 if (HintInst) {
6591 // Add Hint to entry Args and create call
6592 EnterArgs.push_back(HintInst);
6593 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6594 } else {
6595 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6596 }
6597 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
6598
6599 Function *ExitRTLFn =
6600 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6601 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6602
6603 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6604 /*Conditional*/ false, /*hasFinalize*/ true);
6605}
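// Conceptually, the emitted region is bracketed as follows (a sketch, where
// <lock> stands for the lazily created critical-section lock returned by
// getOMPCriticalRegionLock(CriticalName)):
//
//   __kmpc_critical(&loc, tid, &<lock>); // or __kmpc_critical_with_hint
//   ... region body ...
//   __kmpc_end_critical(&loc, tid, &<lock>);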
6606
6607OpenMPIRBuilder::InsertPointTy
6608OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6609 InsertPointTy AllocaIP, unsigned NumLoops,
6610 ArrayRef<llvm::Value *> StoreValues,
6611 const Twine &Name, bool IsDependSource) {
6612 assert(
6613 llvm::all_of(StoreValues,
6614 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6615 "OpenMP runtime requires depend vec with i64 type");
6616
6617 if (!updateToLocation(Loc))
6618 return Loc.IP;
6619
6620 // Allocate space for vector and generate alloc instruction.
6621 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6622 Builder.restoreIP(AllocaIP);
6623 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6624 ArgsBase->setAlignment(Align(8));
6625 updateToLocation(Loc);
6626
6627 // Store the index value with offset in depend vector.
6628 for (unsigned I = 0; I < NumLoops; ++I) {
6629 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6630 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6631 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6632 STInst->setAlignment(Align(8));
6633 }
6634
6635 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6636 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6637
6638 uint32_t SrcLocStrSize;
6639 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6640 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6641 Value *ThreadId = getOrCreateThreadID(Ident);
6642 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6643
6644 Function *RTLFn = nullptr;
6645 if (IsDependSource)
6646 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6647 else
6648 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6649 createRuntimeFunctionCall(RTLFn, Args);
6650
6651 return Builder.saveIP();
6652}
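// For a doacross nest of two loops, for example, the code above amounts to
// (a sketch; "vec" is the i64 depend vector allocated at AllocaIP and i/j
// are the ordered loop indices):
//
//   vec[0] = i; vec[1] = j;
//   __kmpc_doacross_post(&loc, tid, &vec[0]); // IsDependSource == true
//   // or, for an ordered depend(sink:...) clause:
//   __kmpc_doacross_wait(&loc, tid, &vec[0]); // IsDependSource == false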
6653
6654OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6655 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6656 FinalizeCallbackTy FiniCB, bool IsThreads) {
6657 if (!updateToLocation(Loc))
6658 return Loc.IP;
6659
6660 Directive OMPD = Directive::OMPD_ordered;
6661 Instruction *EntryCall = nullptr;
6662 Instruction *ExitCall = nullptr;
6663
6664 if (IsThreads) {
6665 uint32_t SrcLocStrSize;
6666 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6667 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6668 Value *ThreadId = getOrCreateThreadID(Ident);
6669 Value *Args[] = {Ident, ThreadId};
6670
6671 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6672 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6673
6674 Function *ExitRTLFn =
6675 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6676 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6677 }
6678
6679 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6680 /*Conditional*/ false, /*hasFinalize*/ true);
6681}
6682
6683OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6684 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6685 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6686 bool HasFinalize, bool IsCancellable) {
6687
6688 if (HasFinalize)
6689 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6690
6691 // Create inlined region's entry and body blocks, in preparation
6692 // for conditional creation
6693 BasicBlock *EntryBB = Builder.GetInsertBlock();
6694 Instruction *SplitPos = EntryBB->getTerminator();
6695 if (!isa_and_nonnull<BranchInst>(SplitPos))
6696 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6697 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6698 BasicBlock *FiniBB =
6699 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6700
6701 Builder.SetInsertPoint(EntryBB->getTerminator());
6702 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6703
6704 // generate body
6705 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6706 /* CodeGenIP */ Builder.saveIP()))
6707 return Err;
6708
6709 // emit exit call and do any needed finalization.
6710 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6711 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6712 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6713 "Unexpected control flow graph state!!");
6714 InsertPointOrErrorTy AfterIP =
6715 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6716 if (!AfterIP)
6717 return AfterIP.takeError();
6718
6719 // If we are skipping the region of a non-conditional, remove the exit
6720 // block, and clear the builder's insertion point.
6721 assert(SplitPos->getParent() == ExitBB &&
6722 "Unexpected Insertion point location!");
6723 auto merged = MergeBlockIntoPredecessor(ExitBB);
6724 BasicBlock *ExitPredBB = SplitPos->getParent();
6725 auto InsertBB = merged ? ExitPredBB : ExitBB;
6726 if (!isa_and_nonnull<BranchInst>(SplitPos))
6727 SplitPos->eraseFromParent();
6728 Builder.SetInsertPoint(InsertBB);
6729
6730 return Builder.saveIP();
6731}
6732
6733OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6734 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6735 // If there is nothing to do, return the current insertion point.
6736 if (!Conditional || !EntryCall)
6737 return Builder.saveIP();
6738
6739 BasicBlock *EntryBB = Builder.GetInsertBlock();
6740 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6741 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6742 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6743
6744 // Emit thenBB and set the Builder's insertion point there for
6745 // body generation next. Place the block after the current block.
6746 Function *CurFn = EntryBB->getParent();
6747 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6748
6749 // Move Entry branch to end of ThenBB, and replace with conditional
6750 // branch (If-stmt)
6751 Instruction *EntryBBTI = EntryBB->getTerminator();
6752 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6753 EntryBBTI->removeFromParent();
6754 Builder.SetInsertPoint(UI);
6755 Builder.Insert(EntryBBTI);
6756 UI->eraseFromParent();
6757 Builder.SetInsertPoint(ThenBB->getTerminator());
6758
6759 // return an insertion point to ExitBB.
6760 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6761}
6762
6763OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6764 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6765 bool HasFinalize) {
6766
6767 Builder.restoreIP(FinIP);
6768
6769 // If there is finalization to do, emit it before the exit call
6770 if (HasFinalize) {
6771 assert(!FinalizationStack.empty() &&
6772 "Unexpected finalization stack state!");
6773
6774 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6775 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6776
6777 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
6778 return std::move(Err);
6779
6780 // Exit condition: insertion point is before the terminator of the new Fini
6781 // block
6782 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
6783 }
6784
6785 if (!ExitCall)
6786 return Builder.saveIP();
6787
6788 // Place the exit call as the last instruction before the finalization block's terminator.
6789 ExitCall->removeFromParent();
6790 Builder.Insert(ExitCall);
6791
6792 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6793 ExitCall->getIterator());
6794}
6795
6796OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6797 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6798 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6799 if (!IP.isSet())
6800 return IP;
6801
6802 IRBuilder<>::InsertPointGuard IPG(Builder);
6803
6804 // creates the following CFG structure
6805 // OMP_Entry : (MasterAddr != PrivateAddr)?
6806 // F T
6807 // | \
6808 // | copyin.not.master
6809 // | /
6810 // v /
6811 // copyin.not.master.end
6812 // |
6813 // v
6814 // OMP.Entry.Next
6815
6816 BasicBlock *OMP_Entry = IP.getBlock();
6817 Function *CurFn = OMP_Entry->getParent();
6818 BasicBlock *CopyBegin =
6819 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6820 BasicBlock *CopyEnd = nullptr;
6821
6822 // If the entry block is terminated, split it to preserve the branch to the
6823 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6824 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6825 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6826 "copyin.not.master.end");
6827 OMP_Entry->getTerminator()->eraseFromParent();
6828 } else {
6829 CopyEnd =
6830 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6831 }
6832
6833 Builder.SetInsertPoint(OMP_Entry);
6834 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6835 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6836 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6837 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6838
6839 Builder.SetInsertPoint(CopyBegin);
6840 if (BranchtoEnd)
6841 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6842
6843 return Builder.saveIP();
6844}
6845
6846CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6847 Value *Size, Value *Allocator,
6848 std::string Name) {
6849 IRBuilder<>::InsertPointGuard IPG(Builder);
6850 updateToLocation(Loc);
6851
6852 uint32_t SrcLocStrSize;
6853 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6854 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6855 Value *ThreadId = getOrCreateThreadID(Ident);
6856 Value *Args[] = {ThreadId, Size, Allocator};
6857
6858 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6859
6860 return createRuntimeFunctionCall(Fn, Args, Name);
6861}
6862
6863CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6864 Value *Addr, Value *Allocator,
6865 std::string Name) {
6866 IRBuilder<>::InsertPointGuard IPG(Builder);
6867 updateToLocation(Loc);
6868
6869 uint32_t SrcLocStrSize;
6870 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6871 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6872 Value *ThreadId = getOrCreateThreadID(Ident);
6873 Value *Args[] = {ThreadId, Addr, Allocator};
6874 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6875 return createRuntimeFunctionCall(Fn, Args, Name);
6876}
6877
6878CallInst *OpenMPIRBuilder::createOMPInteropInit(
6879 const LocationDescription &Loc, Value *InteropVar,
6880 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6881 Value *DependenceAddress, bool HaveNowaitClause) {
6882 IRBuilder<>::InsertPointGuard IPG(Builder);
6883 updateToLocation(Loc);
6884
6885 uint32_t SrcLocStrSize;
6886 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6887 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6888 Value *ThreadId = getOrCreateThreadID(Ident);
6889 if (Device == nullptr)
6890 Device = ConstantInt::get(Int32, -1);
6891 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6892 if (NumDependences == nullptr) {
6893 NumDependences = ConstantInt::get(Int32, 0);
6894 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6895 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6896 }
6897 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6898 Value *Args[] = {
6899 Ident, ThreadId, InteropVar, InteropTypeVal,
6900 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6901
6902 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6903
6904 return createRuntimeFunctionCall(Fn, Args);
6905}
6906
6907CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6908 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6909 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6910 IRBuilder<>::InsertPointGuard IPG(Builder);
6911 updateToLocation(Loc);
6912
6913 uint32_t SrcLocStrSize;
6914 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6915 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6916 Value *ThreadId = getOrCreateThreadID(Ident);
6917 if (Device == nullptr)
6918 Device = ConstantInt::get(Int32, -1);
6919 if (NumDependences == nullptr) {
6920 NumDependences = ConstantInt::get(Int32, 0);
6921 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6922 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6923 }
6924 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6925 Value *Args[] = {
6926 Ident, ThreadId, InteropVar, Device,
6927 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6928
6929 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6930
6931 return createRuntimeFunctionCall(Fn, Args);
6932}
6933
6934CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6935 Value *InteropVar, Value *Device,
6936 Value *NumDependences,
6937 Value *DependenceAddress,
6938 bool HaveNowaitClause) {
6939 IRBuilder<>::InsertPointGuard IPG(Builder);
6940 updateToLocation(Loc);
6941 uint32_t SrcLocStrSize;
6942 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6943 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6944 Value *ThreadId = getOrCreateThreadID(Ident);
6945 if (Device == nullptr)
6946 Device = ConstantInt::get(Int32, -1);
6947 if (NumDependences == nullptr) {
6948 NumDependences = ConstantInt::get(Int32, 0);
6949 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6950 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6951 }
6952 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6953 Value *Args[] = {
6954 Ident, ThreadId, InteropVar, Device,
6955 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6956
6957 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6958
6959 return createRuntimeFunctionCall(Fn, Args);
6960}
6961
6962CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6963 const LocationDescription &Loc, llvm::Value *Pointer,
6964 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6965 IRBuilder<>::InsertPointGuard IPG(Builder);
6966 updateToLocation(Loc);
6967
6968 uint32_t SrcLocStrSize;
6969 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6970 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6971 Value *ThreadId = getOrCreateThreadID(Ident);
6972 Constant *ThreadPrivateCache =
6973 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6974 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6975
6976 Function *Fn =
6977 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6978
6979 return createRuntimeFunctionCall(Fn, Args);
6980}
6981
6982OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6983 const LocationDescription &Loc,
6984 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6985 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6986 "expected num_threads and num_teams to be specified");
6987
6988 if (!updateToLocation(Loc))
6989 return Loc.IP;
6990
6991 uint32_t SrcLocStrSize;
6992 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6993 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6994 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6995 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6996 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6997 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6998 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6999
7000 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7001 Function *Kernel = DebugKernelWrapper;
7002
7003 // We need to strip the debug prefix to get the correct kernel name.
7004 StringRef KernelName = Kernel->getName();
7005 const std::string DebugPrefix = "_debug__";
7006 if (KernelName.ends_with(DebugPrefix)) {
7007 KernelName = KernelName.drop_back(DebugPrefix.length());
7008 Kernel = M.getFunction(KernelName);
7009 assert(Kernel && "Expected the real kernel to exist");
7010 }
7011
7012 // Manifest the launch configuration in the metadata matching the kernel
7013 // environment.
7014 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7015 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7016
7017 // If MaxThreads is not set, select the maximum between the default workgroup
7018 // size and the MinThreads value.
7019 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7020 if (MaxThreadsVal < 0)
7021 MaxThreadsVal = std::max(
7022 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7023
7024 if (MaxThreadsVal > 0)
7025 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7026
7027 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7028 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7029 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7030 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7031 Constant *ReductionDataSize =
7032 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7033 Constant *ReductionBufferLength =
7034 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7035
7036 Function *Fn = getOrCreateRuntimeFunctionPtr(
7037 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7038 const DataLayout &DL = Fn->getDataLayout();
7039
7040 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7041 Constant *DynamicEnvironmentInitializer =
7042 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7043 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7044 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7045 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7046 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7047 DL.getDefaultGlobalsAddressSpace());
7048 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7049
7050 Constant *DynamicEnvironment =
7051 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7052 ? DynamicEnvironmentGV
7053 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7054 DynamicEnvironmentPtr);
7055
7056 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7057 ConfigurationEnvironment, {
7058 UseGenericStateMachineVal,
7059 MayUseNestedParallelismVal,
7060 IsSPMDVal,
7061 MinThreads,
7062 MaxThreads,
7063 MinTeams,
7064 MaxTeams,
7065 ReductionDataSize,
7066 ReductionBufferLength,
7067 });
7068 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7069 KernelEnvironment, {
7070 ConfigurationEnvironmentInitializer,
7071 Ident,
7072 DynamicEnvironment,
7073 });
7074 std::string KernelEnvironmentName =
7075 (KernelName + "_kernel_environment").str();
7076 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7077 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7078 KernelEnvironmentInitializer, KernelEnvironmentName,
7079 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7080 DL.getDefaultGlobalsAddressSpace());
7081 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7082
7083 Constant *KernelEnvironment =
7084 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7085 ? KernelEnvironmentGV
7086 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7087 KernelEnvironmentPtr);
7088 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7089 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7090 KernelLaunchEnvironment =
7091 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7092 ? KernelLaunchEnvironment
7093 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7094 KernelLaunchEnvParamTy);
7095 CallInst *ThreadKind = createRuntimeFunctionCall(
7096 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7097
7098 Value *ExecUserCode = Builder.CreateICmpEQ(
7099 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7100 "exec_user_code");
7101
7102 // ThreadKind = __kmpc_target_init(...)
7103 // if (ThreadKind == -1)
7104 // user_code
7105 // else
7106 // return;
7107
7108 auto *UI = Builder.CreateUnreachable();
7109 BasicBlock *CheckBB = UI->getParent();
7110 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7111
7112 BasicBlock *WorkerExitBB = BasicBlock::Create(
7113 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7114 Builder.SetInsertPoint(WorkerExitBB);
7115 Builder.CreateRetVoid();
7116
7117 auto *CheckBBTI = CheckBB->getTerminator();
7118 Builder.SetInsertPoint(CheckBBTI);
7119 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7120
7121 CheckBBTI->eraseFromParent();
7122 UI->eraseFromParent();
7123
7124 // Continue in the "user_code" block, see diagram above and in
7125 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7126 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7127}
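// For reference, a minimal sketch of what the initialization above produces
// for a kernel named "foo" (names and struct layouts are illustrative, not
// emitted verbatim):
//
//   @foo_dynamic_environment = weak_odr protected global %DynamicEnvironmentTy ...
//   @foo_kernel_environment = weak_odr protected constant %KernelEnvironmentTy
//       { %ConfigurationEnvironmentTy { ... }, ptr @ident, ptr @foo_dynamic_environment }
//
//   %thread_kind = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %thread_kind, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit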
7128
7129void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
7130 int32_t TeamsReductionDataSize,
7131 int32_t TeamsReductionBufferLength) {
7132 if (!updateToLocation(Loc))
7133 return;
7134
7135 Function *Fn = getOrCreateRuntimeFunctionPtr(
7136 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7137
7138 createRuntimeFunctionCall(Fn, {});
7139
7140 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7141 return;
7142
7143 Function *Kernel = Builder.GetInsertBlock()->getParent();
7144 // We need to strip the debug prefix to get the correct kernel name.
7145 StringRef KernelName = Kernel->getName();
7146 const std::string DebugPrefix = "_debug__";
7147 if (KernelName.ends_with(DebugPrefix))
7148 KernelName = KernelName.drop_back(DebugPrefix.length());
7149 auto *KernelEnvironmentGV =
7150 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7151 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7152 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7153 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7154 KernelEnvironmentInitializer,
7155 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7156 NewInitializer = ConstantFoldInsertValueInstruction(
7157 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7158 {0, 8});
7159 KernelEnvironmentGV->setInitializer(NewInitializer);
7160}
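// As a sketch of the rewrite above (field positions follow the configuration
// environment layout built during initialization): starting from an
// initializer whose ReductionDataSize and ReductionBufferLength fields
// (indices {0,7} and {0,8}) are 0, setInitializer installs a re-folded
// constant with those two fields replaced by TeamsReductionDataSize and
// TeamsReductionBufferLength.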
7161
7162static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7163 bool Min) {
7164 if (Kernel.hasFnAttribute(Name)) {
7165 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7166 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7167 }
7168 Kernel.addFnAttr(Name, llvm::utostr(Value));
7169}
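// Example (K is an arbitrary kernel function): if K already carries
// "nvvm.maxntid"="256", then updateNVPTXAttr(K, "nvvm.maxntid", 128,
// /*Min=*/true) tightens the bound to min(256, 128) = 128, while Min=false
// would keep the larger of the two values.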
7170
7171std::pair<int32_t, int32_t>
7172OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
7173 int32_t ThreadLimit =
7174 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7175
7176 if (T.isAMDGPU()) {
7177 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7178 if (!Attr.isValid() || !Attr.isStringAttribute())
7179 return {0, ThreadLimit};
7180 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7181 int32_t LB, UB;
7182 if (!llvm::to_integer(UBStr, UB, 10))
7183 return {0, ThreadLimit};
7184 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7185 if (!llvm::to_integer(LBStr, LB, 10))
7186 return {0, UB};
7187 return {LB, UB};
7188 }
7189
7190 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7191 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7192 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7193 }
7194 return {0, ThreadLimit};
7195}
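// Example (values illustrative): an AMDGPU kernel annotated with
// "amdgpu-flat-work-group-size"="1,256" and "omp_target_thread_limit"="128"
// yields {1, min(128, 256)} = {1, 128}; if the upper bound string does not
// parse, the result falls back to {0, ThreadLimit}.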
7196
7197void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
7198 Function &Kernel, int32_t LB,
7199 int32_t UB) {
7200 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7201
7202 if (T.isAMDGPU()) {
7203 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7204 llvm::utostr(LB) + "," + llvm::utostr(UB));
7205 return;
7206 }
7207
7208 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, /*Min=*/true);
7209}
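// Example: writeThreadBoundsForKernel(T, K, /*LB=*/1, /*UB=*/256) always
// records "omp_target_thread_limit"="256" and additionally emits
// "amdgpu-flat-work-group-size"="1,256" on AMDGPU, or a min-combined
// "nvvm.maxntid"="256" on NVPTX.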
7210
7211std::pair<int32_t, int32_t>
7212OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
7213 // TODO: Read from backend annotations if available.
7214 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7215}
7216
7217void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7218 int32_t LB, int32_t UB) {
7219 if (T.isNVPTX())
7220 if (UB > 0)
7221 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7222 if (T.isAMDGPU())
7223 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7224
7225 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7226}
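// Example: writeTeamsForKernel(T, K, /*LB=*/4, /*UB=*/16) records
// "omp_target_num_teams"="4" on every target, plus "nvvm.maxclusterrank"="16"
// on NVPTX; on AMDGPU the lower bound is used as the workgroup count:
// "amdgpu-max-num-workgroups"="4,1,1".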
7227
7228void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7229 Function *OutlinedFn) {
7230 if (Config.isTargetDevice()) {
7231 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7232 // TODO: Determine if DSO local can be set to true.
7233 OutlinedFn->setDSOLocal(false);
7234 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7235 if (T.isAMDGCN())
7236 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7237 else if (T.isNVPTX())
7238 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7239 else if (T.isSPIRV())
7240 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7241 }
7242}
7243
7244Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7245 StringRef EntryFnIDName) {
7246 if (Config.isTargetDevice()) {
7247 assert(OutlinedFn && "The outlined function must exist if embedded");
7248 return OutlinedFn;
7249 }
7250
7251 return new GlobalVariable(
7252 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7253 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7254}
7255
7256Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7257 StringRef EntryFnName) {
7258 if (OutlinedFn)
7259 return OutlinedFn;
7260
7261 assert(!M.getGlobalVariable(EntryFnName, true) &&
7262 "Named kernel already exists?");
7263 return new GlobalVariable(
7264 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7265 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7266}
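// Host-side sketch (entry name illustrative): for a target region entry
// "__omp_offloading_10_abc_foo_l5", createOutlinedFunctionID hands back a
// one-byte weak global the runtime uses as a key to locate the device binary,
// roughly:
//   @.__omp_offloading_10_abc_foo_l5.region_id = weak constant i8 0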
7267
7268Error OpenMPIRBuilder::emitTargetRegionFunction(
7269 TargetRegionEntryInfo &EntryInfo,
7270 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7271 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7272
7273 SmallString<64> EntryFnName;
7274 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7275
7276 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7277 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7278 if (!CBResult)
7279 return CBResult.takeError();
7280 OutlinedFn = *CBResult;
7281 } else {
7282 OutlinedFn = nullptr;
7283 }
7284
7285 // If this target outline function is not an offload entry, we don't need to
7286 // register it. This may be the case for a false 'if' clause or when there
7287 // are no OpenMP targets.
7288 if (!IsOffloadEntry)
7289 return Error::success();
7290
7291 std::string EntryFnIDName =
7292 Config.isTargetDevice()
7293 ? std::string(EntryFnName)
7294 : createPlatformSpecificName({EntryFnName, "region_id"});
7295
7296 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7297 EntryFnName, EntryFnIDName);
7298 return Error::success();
7299}
7300
7301Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7302 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7303 StringRef EntryFnName, StringRef EntryFnIDName) {
7304 if (OutlinedFn)
7305 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7306 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7307 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7308 OffloadInfoManager.registerTargetRegionEntryInfo(
7309 EntryInfo, EntryAddr, OutlinedFnID,
7310 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7311 return OutlinedFnID;
7312}
7313
7314OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7315 const LocationDescription &Loc, InsertPointTy AllocaIP,
7316 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7317 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7318 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7319 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7320 BodyGenTy BodyGenType)>
7321 BodyGenCB,
7322 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7323 if (!updateToLocation(Loc))
7324 return InsertPointTy();
7325
7326 Builder.restoreIP(CodeGenIP);
7327 // Disable TargetData CodeGen on Device pass.
7328 if (Config.IsTargetDevice.value_or(false)) {
7329 if (BodyGenCB) {
7330 InsertPointOrErrorTy AfterIP =
7331 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7332 if (!AfterIP)
7333 return AfterIP.takeError();
7334 Builder.restoreIP(*AfterIP);
7335 }
7336 return Builder.saveIP();
7337 }
7338
7339 bool IsStandAlone = !BodyGenCB;
7340 MapInfosTy *MapInfo;
7341 // Generate the code for the opening of the data environment. Capture all the
7342 // arguments of the runtime call by reference because they are used in the
7343 // closing of the region.
7344 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7345 InsertPointTy CodeGenIP) -> Error {
7346 MapInfo = &GenMapInfoCB(Builder.saveIP());
7347 if (Error Err = emitOffloadingArrays(
7348 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7349 /*IsNonContiguous=*/true, DeviceAddrCB))
7350 return Err;
7351
7352 TargetDataRTArgs RTArgs;
7353 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7354
7355 // Emit the number of elements in the offloading arrays.
7356 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7357
7358 // Source location for the ident struct
7359 if (!SrcLocInfo) {
7360 uint32_t SrcLocStrSize;
7361 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7362 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7363 }
7364
7365 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7366 SrcLocInfo, DeviceID,
7367 PointerNum, RTArgs.BasePointersArray,
7368 RTArgs.PointersArray, RTArgs.SizesArray,
7369 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7370 RTArgs.MappersArray};
7371
7372 if (IsStandAlone) {
7373 assert(MapperFunc && "MapperFunc missing for standalone target data");
7374
7375 auto TaskBodyCB = [&](Value *, Value *,
7376 IRBuilderBase::InsertPoint) -> Error {
7377 if (Info.HasNoWait) {
7378 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7379 llvm::Constant::getNullValue(VoidPtr),
7380 llvm::Constant::getNullValue(Int32),
7381 llvm::Constant::getNullValue(VoidPtr)});
7382 }
7383
7384 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7385 OffloadingArgs);
7386
7387 if (Info.HasNoWait) {
7388 BasicBlock *OffloadContBlock =
7389 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7390 Function *CurFn = Builder.GetInsertBlock()->getParent();
7391 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7392 Builder.restoreIP(Builder.saveIP());
7393 }
7394 return Error::success();
7395 };
7396
7397 bool RequiresOuterTargetTask = Info.HasNoWait;
7398 if (!RequiresOuterTargetTask)
7399 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7400 /*TargetTaskAllocaIP=*/{}));
7401 else
7402 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7403 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7404 } else {
7405 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7406 omp::OMPRTL___tgt_target_data_begin_mapper);
7407
7408 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
7409
7410 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7411 if (isa<AllocaInst>(DeviceMap.second.second)) {
7412 auto *LI =
7413 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7414 Builder.CreateStore(LI, DeviceMap.second.second);
7415 }
7416 }
7417
7418 // If device pointer privatization is required, emit the body of the
7419 // region here. It will have to be duplicated: with and without
7420 // privatization.
7421 InsertPointOrErrorTy AfterIP =
7422 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7423 if (!AfterIP)
7424 return AfterIP.takeError();
7425 Builder.restoreIP(*AfterIP);
7426 }
7427 return Error::success();
7428 };
7429
7430 // If we need device pointer privatization, we need to emit the body of the
7431 // region with no privatization in the 'else' branch of the conditional.
7432 // Otherwise, we don't have to do anything.
7433 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7434 InsertPointTy CodeGenIP) -> Error {
7435 InsertPointOrErrorTy AfterIP =
7436 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7437 if (!AfterIP)
7438 return AfterIP.takeError();
7439 Builder.restoreIP(*AfterIP);
7440 return Error::success();
7441 };
7442
7443 // Generate code for the closing of the data region.
7444 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7445 TargetDataRTArgs RTArgs;
7446 Info.EmitDebug = !MapInfo->Names.empty();
7447 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7448
7449 // Emit the number of elements in the offloading arrays.
7450 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7451
7452 // Source location for the ident struct
7453 if (!SrcLocInfo) {
7454 uint32_t SrcLocStrSize;
7455 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7456 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7457 }
7458
7459 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7460 PointerNum, RTArgs.BasePointersArray,
7461 RTArgs.PointersArray, RTArgs.SizesArray,
7462 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7463 RTArgs.MappersArray};
7464 Function *EndMapperFunc =
7465 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7466
7467 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
7468 return Error::success();
7469 };
7470
7471 // We don't have to do anything to close the region if the if clause evaluates
7472 // to false.
7473 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7474 return Error::success();
7475 };
7476
7477 Error Err = [&]() -> Error {
7478 if (BodyGenCB) {
7479 Error Err = [&]() {
7480 if (IfCond)
7481 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7482 return BeginThenGen(AllocaIP, Builder.saveIP());
7483 }();
7484
7485 if (Err)
7486 return Err;
7487
7488 // If we don't require privatization of device pointers, we emit the body
7489 // in between the runtime calls. This avoids duplicating the body code.
7490 InsertPointOrErrorTy AfterIP =
7491 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7492 if (!AfterIP)
7493 return AfterIP.takeError();
7494 restoreIPandDebugLoc(Builder, *AfterIP);
7495
7496 if (IfCond)
7497 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7498 return EndThenGen(AllocaIP, Builder.saveIP());
7499 }
7500 if (IfCond)
7501 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7502 return BeginThenGen(AllocaIP, Builder.saveIP());
7503 }();
7504
7505 if (Err)
7506 return Err;
7507
7508 return Builder.saveIP();
7509}
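// For a data region with a body and no if-clause, the net effect is roughly
// (a sketch with abbreviated argument names; device number and arrays are the
// ones built above):
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %dev, i32 %n,
//       ptr %base, ptr %ptrs, ptr %sizes, ptr %types, ptr %names, ptr %mappers)
//   ;; ... BodyGenCB-generated code ...
//   call void @__tgt_target_data_end_mapper(... same argument layout ...)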
7510
7511FunctionCallee
7512OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7513 bool IsGPUDistribute) {
7514 assert((IVSize == 32 || IVSize == 64) &&
7515 "IV size is not compatible with the omp runtime");
7516 RuntimeFunction Name;
7517 if (IsGPUDistribute)
7518 Name = IVSize == 32
7519 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7520 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7521 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7522 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7523 else
7524 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7525 : omp::OMPRTL___kmpc_for_static_init_4u)
7526 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7527 : omp::OMPRTL___kmpc_for_static_init_8u);
7528
7529 return getOrCreateRuntimeFunction(M, Name);
7530}
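// Example: (IVSize=32, IVSigned=true) selects __kmpc_for_static_init_4 and
// (IVSize=64, IVSigned=false) selects __kmpc_for_static_init_8u; with
// IsGPUDistribute set, the __kmpc_distribute_static_init_* flavors are chosen
// instead.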
7531
7532FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7533 bool IVSigned) {
7534 assert((IVSize == 32 || IVSize == 64) &&
7535 "IV size is not compatible with the omp runtime");
7536 RuntimeFunction Name = IVSize == 32
7537 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7538 : omp::OMPRTL___kmpc_dispatch_init_4u)
7539 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7540 : omp::OMPRTL___kmpc_dispatch_init_8u);
7541
7542 return getOrCreateRuntimeFunction(M, Name);
7543}
7544
7545FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7546 bool IVSigned) {
7547 assert((IVSize == 32 || IVSize == 64) &&
7548 "IV size is not compatible with the omp runtime");
7549 RuntimeFunction Name = IVSize == 32
7550 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7551 : omp::OMPRTL___kmpc_dispatch_next_4u)
7552 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7553 : omp::OMPRTL___kmpc_dispatch_next_8u);
7554
7555 return getOrCreateRuntimeFunction(M, Name);
7556}
7557
7558FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7559 bool IVSigned) {
7560 assert((IVSize == 32 || IVSize == 64) &&
7561 "IV size is not compatible with the omp runtime");
7562 RuntimeFunction Name = IVSize == 32
7563 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7564 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7565 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7566 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7567
7568 return getOrCreateRuntimeFunction(M, Name);
7569}
7570
7571FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7572 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7573}
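// Taken together, the dispatch helpers drive dynamically scheduled loops in
// roughly this shape (32-bit signed IV shown; a sketch, not emitted
// verbatim):
//   call void @__kmpc_dispatch_init_4(ptr @loc, i32 %tid, i32 %sched,
//                                     i32 %lb, i32 %ub, i32 %st, i32 %chunk)
//   while (call i32 @__kmpc_dispatch_next_4(ptr @loc, i32 %tid, ptr %last,
//                                           ptr %lb, ptr %ub, ptr %st)) {
//     ;; run the assigned chunk; for ordered loops, __kmpc_dispatch_fini_4
//     ;; is called after each chunk
//   }
// with __kmpc_dispatch_deinit tearing the dispatch descriptor down.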
7574
7575static void FixupDebugInfoForOutlinedFunction(
7576 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7577 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7578
7579 DISubprogram *NewSP = Func->getSubprogram();
7580 if (!NewSP)
7581 return;
7582
7583 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7584
7585 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7586 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7587 // Only use cached variable if the arg number matches. This is important
7588 // so that DIVariable created for privatized variables are not discarded.
7589 if (NewVar && (arg == NewVar->getArg()))
7590 return NewVar;
7591
7592 NewVar = llvm::DILocalVariable::get(
7593 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7594 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7595 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7596 return NewVar;
7597 };
7598
7599 auto UpdateDebugRecord = [&](auto *DR) {
7600 DILocalVariable *OldVar = DR->getVariable();
7601 unsigned ArgNo = 0;
7602 for (auto Loc : DR->location_ops()) {
7603 auto Iter = ValueReplacementMap.find(Loc);
7604 if (Iter != ValueReplacementMap.end()) {
7605 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7606 ArgNo = std::get<1>(Iter->second) + 1;
7607 }
7608 }
7609 if (ArgNo != 0)
7610 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7611 };
7612
7613 // The location and scope of variable intrinsics and records still point to
7614 // the parent function of the target region. Update them.
7615 for (Instruction &I : instructions(Func)) {
7616 assert(!isa<DbgInfoIntrinsic>(I) &&
7617 "Unexpected debug intrinsic");
7618 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7619 UpdateDebugRecord(&DVR);
7620 }
7621 // An extra argument is passed to the device. Create the debug data for it.
7622 if (OMPBuilder.Config.isTargetDevice()) {
7623 DICompileUnit *CU = NewSP->getUnit();
7624 Module *M = Func->getParent();
7625 DIBuilder DB(*M, true, CU);
7626 DIType *VoidPtrTy =
7627 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7628 DILocalVariable *Var = DB.createParameterVariable(
7629 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7630 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7631 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7632 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7633 &(*Func->begin()));
7634 }
7635}
7636
7637static Value *removeASCastIfPresent(Value *V) {
7638 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7639 return cast<Operator>(V)->getOperand(0);
7640 return V;
7641}
7642
7644 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7645 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7646 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7647 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7648 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7649 SmallVector<Type *> ParameterTypes;
7650 if (OMPBuilder.Config.isTargetDevice()) {
7651 // Add the "implicit" runtime argument we use to provide launch specific
7652 // information for target devices.
7653 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7654 ParameterTypes.push_back(Int8PtrTy);
7655
7656 // All parameters to target devices are passed as pointers
7657 // or i64. This assumes 64-bit address spaces/pointers.
7658 for (auto &Arg : Inputs)
7659 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7660 ? Arg->getType()
7661 : Type::getInt64Ty(Builder.getContext()));
7662 } else {
7663 for (auto &Arg : Inputs)
7664 ParameterTypes.push_back(Arg->getType());
7665 }
7666
7667 auto BB = Builder.GetInsertBlock();
7668 auto M = BB->getModule();
7669 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7670 /*isVarArg*/ false);
7671 auto Func =
7672 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7673
7674 // Forward target-cpu and target-features function attributes from the
7675 // original function to the new outlined function.
7676 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7677
7678 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7679 if (TargetCpuAttr.isStringAttribute())
7680 Func->addFnAttr(TargetCpuAttr);
7681
7682 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7683 if (TargetFeaturesAttr.isStringAttribute())
7684 Func->addFnAttr(TargetFeaturesAttr);
7685
7686 if (OMPBuilder.Config.isTargetDevice()) {
7687 Value *ExecMode =
7688 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7689 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7690 }
7691
7692 // Save insert point.
7693 IRBuilder<>::InsertPointGuard IPG(Builder);
7694 // We will generate the entries in the outlined function but the debug
7695 // location may still be pointing to the parent function. Reset it now.
7696 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7697
7698 // Generate the region into the function.
7699 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7700 Builder.SetInsertPoint(EntryBB);
7701
7702 // Insert target init call in the device compilation pass.
7703 if (OMPBuilder.Config.isTargetDevice())
7704 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7705
7706 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7707
7708 // As we embed the user code in the middle of our target region after we
7709 // generate entry code, we must move what allocas we can into the entry
7710 // block to avoid possibly breaking optimisations for the device.
7711 if (OMPBuilder.Config.isTargetDevice())
7712 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7713
7714 // Insert target deinit call in the device compilation pass.
7715 BasicBlock *OutlinedBodyBB =
7716 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7717 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7718 Builder.saveIP(),
7719 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7720 if (!AfterIP)
7721 return AfterIP.takeError();
7722 Builder.restoreIP(*AfterIP);
7723 if (OMPBuilder.Config.isTargetDevice())
7724 OMPBuilder.createTargetDeinit(Builder);
7725
7726 // Insert return instruction.
7727 Builder.CreateRetVoid();
7728
7729 // New Alloca IP at entry point of created device function.
7730 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7731 auto AllocaIP = Builder.saveIP();
7732
7733 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7734
7735 // Skip the artificial dyn_ptr on the device.
7736 const auto &ArgRange =
7737 OMPBuilder.Config.isTargetDevice()
7738 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7739 : Func->args();
7740
7741 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7742
7743 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7744 // Things like GEP's can come in the form of Constants. Constants and
7745 // ConstantExpr's do not have access to the knowledge of what they're
7746 // contained in, so we must dig a little to find an instruction so we
7747 // can tell if they're used inside of the function we're outlining. We
7748 // also replace the original constant expression with a new instruction
7749 // equivalent: an instruction, as this allows easy modification in the
7750 // following loop, since we now know the constant (instruction) is
7751 // owned by our target function and replaceUsesOfWith can now be invoked
7752 // on it (cannot do this with constants it seems). A brand new one also
7753 // allows us to be cautious as it is perhaps possible the old expression
7754 // was used inside of the function but exists and is used externally
7755 // (unlikely by the nature of a Constant, but still).
7756 // NOTE: We cannot remove dead constants that have been rewritten to
7757 // instructions at this stage, we run the risk of breaking later lowering
7758 // by doing so as we could still be in the process of lowering the module
7759 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7760 // constants we have created rewritten versions of.
7761 if (auto *Const = dyn_cast<Constant>(Input))
7762 convertUsersOfConstantsToInstructions(Const, Func, false);
7763
7764 // Collect users before iterating over them to avoid invalidating the
7765 // iteration in case a user uses Input more than once (e.g. a call
7766 // instruction).
7767 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7768 // Collect all the instructions
7769 for (User *User : make_early_inc_range(Users))
7770 if (auto *Instr = dyn_cast<Instruction>(User))
7771 if (Instr->getFunction() == Func)
7772 Instr->replaceUsesOfWith(Input, InputCopy);
7773 };
7774
7775 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7776
7777 // Rewrite uses of input values to parameters.
7778 for (auto InArg : zip(Inputs, ArgRange)) {
7779 Value *Input = std::get<0>(InArg);
7780 Argument &Arg = std::get<1>(InArg);
7781 Value *InputCopy = nullptr;
7782
7783 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7784 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7785 if (!AfterIP)
7786 return AfterIP.takeError();
7787 Builder.restoreIP(*AfterIP);
7788 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7789
7790 // In certain cases a Global may be set up for replacement; however, this
7791 // Global may be used in multiple arguments to the kernel, just segmented
7792 // apart. For example, if we have a global array that is sectioned into
7793 // multiple mappings (technically not legal in OpenMP, but there is a case
7794 // in Fortran for Common Blocks where this is necessary), we will end up
7795 // with GEPs into this array inside the kernel that refer to the Global
7796 // but are technically separate arguments to the kernel for all intents and
7797 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7798 // index, it will fold into a reference to the Global; if we then encounter
7799 // this folded GEP during replacement, all of the references to the
7800 // Global in the kernel will be replaced with the argument we have generated
7801 // that corresponds to it, including any other GEPs that refer to the
7802 // Global and that may be other arguments. This would invalidate all of the
7803 // preceding mapped arguments that refer to the same global but are
7804 // separate segments. To prevent this, we defer global processing until all
7805 // other processing has been performed.
7806 if (llvm::isa<llvm::GlobalValue>(removeASCastIfPresent(Input)) ||
7807 llvm::isa<llvm::GlobalObject>(removeASCastIfPresent(Input))) {
7808 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7809 continue;
7810 }
7811
7812 if (isa<ConstantData>(Input))
7813 continue;
7814
7815 ReplaceValue(Input, InputCopy, Func);
7816 }
7817
7818 // Replace all of our deferred Input values, currently just Globals.
7819 for (auto Deferred : DeferredReplacement)
7820 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7821
7822 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7823 ValueReplacementMap);
7824 return Func;
7825}
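// Illustrative shape of the resulting device entry once the attribute setup
// in setOutlinedTargetRegionFunctionAttributes has also applied linkage,
// visibility, and calling convention (kernel name and parameter list depend
// on the target region and its inputs; AMDGPU shown):
//   define weak_odr protected amdgpu_kernel void @__omp_offloading_...(
//       ptr %dyn_ptr, ptr %a, i64 %n) { ... }
// where %dyn_ptr is the implicit launch argument added above and the
// remaining parameters are the rewritten region inputs.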
7826/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7827/// of pointers containing shared data between the parent task and the created
7828/// task.
7829static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7830 IRBuilderBase &Builder,
7831 Value *TaskWithPrivates,
7832 Type *TaskWithPrivatesTy) {
7833
7834 Type *TaskTy = OMPIRBuilder.Task;
7835 LLVMContext &Ctx = Builder.getContext();
7836 Value *TaskT =
7837 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7838 Value *Shareds = TaskT;
7839 // TaskWithPrivatesTy can be one of the following
7840 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7841 // %struct.privates }
7842 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7843 //
7844 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7845 // its first member has to be the task descriptor. TaskTy is the type of the
7846 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7847 // first member of TaskT, gives us the pointer to shared data.
7848 if (TaskWithPrivatesTy != TaskTy)
7849 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7850 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7851}
7852/// Create an entry point for a target task with the following.
7853/// It'll have the following signature
7854/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7855/// This function is called from emitTargetTask once the
7856/// code to launch the target kernel has been outlined already.
7857/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7858/// into the task structure so that the deferred target task can access this
7859/// data even after the stack frame of the generating task has been rolled
7860/// back. Offloading arrays contain base pointers, pointers, sizes etc
7861/// of the data that the target kernel will access. These in effect are the
7862/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7863static Function *emitTargetTaskProxyFunction(
7864 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7865 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7866 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7867
7868 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7869 // This is because PrivatesTy is the type of the structure in which
7870 // we pass the offloading arrays to the deferred target task.
7871 assert((!NumOffloadingArrays || PrivatesTy) &&
7872 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7873 "to privatize");
7874
7875 Module &M = OMPBuilder.M;
7876 // KernelLaunchFunction is the target launch function, i.e.
7877 // the function that sets up kernel arguments and calls
7878 // __tgt_target_kernel to launch the kernel on the device.
7879 //
7880 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7881
7882 // StaleCI is the CallInst which is the call to the outlined
7883 // target kernel launch function. If there are local live-in values
7884 // that the outlined function uses then these are aggregated into a structure
7885 // which is passed as the second argument. If there are no local live-in
7886 // values or if all values used by the outlined kernel are global variables,
7887 // then there's only one argument, the threadID. So, StaleCI can be
7888 //
7889 // %structArg = alloca { ptr, ptr }, align 8
7890 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7891 // store ptr %20, ptr %gep_, align 8
7892 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7893 // store ptr %21, ptr %gep_8, align 8
7894 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7895 //
7896 // OR
7897 //
7898 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7899 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7900 StaleCI->getIterator());
7901
7902 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7903
7904 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7905 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7906 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7907
7908 auto ProxyFnTy =
7909 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7910 /* isVarArg */ false);
7911 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7912 ".omp_target_task_proxy_func",
7913 Builder.GetInsertBlock()->getModule());
7914 Value *ThreadId = ProxyFn->getArg(0);
7915 Value *TaskWithPrivates = ProxyFn->getArg(1);
7916 ThreadId->setName("thread.id");
7917 TaskWithPrivates->setName("task");
7918
7919 bool HasShareds = SharedArgsOperandNo > 0;
7920 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7921 BasicBlock *EntryBB =
7922 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7923 Builder.SetInsertPoint(EntryBB);
7924
7925 SmallVector<Value *> KernelLaunchArgs;
7926 KernelLaunchArgs.reserve(StaleCI->arg_size());
7927 KernelLaunchArgs.push_back(ThreadId);
7928
7929 if (HasOffloadingArrays) {
7930 assert(TaskTy != TaskWithPrivatesTy &&
7931 "If there are offloading arrays to pass to the target"
7932 "TaskTy cannot be the same as TaskWithPrivatesTy");
7933 (void)TaskTy;
7934 Value *Privates =
7935 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7936 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7937 KernelLaunchArgs.push_back(
7938 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7939 }
7940
7941 if (HasShareds) {
7942 auto *ArgStructAlloca =
7943 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7944 assert(ArgStructAlloca &&
7945 "Unable to find the alloca instruction corresponding to arguments "
7946 "for extracted function");
7947 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7948
7949 AllocaInst *NewArgStructAlloca =
7950 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7951
7952 Value *SharedsSize =
7953 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7954
7955 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7956 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7957
7958 Builder.CreateMemCpy(
7959 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7960 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7961 KernelLaunchArgs.push_back(NewArgStructAlloca);
7962 }
7963 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
7964 Builder.CreateRetVoid();
7965 return ProxyFn;
7966}
7967static Type *getOffloadingArrayType(Value *V) {
7968
7969 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7970 return GEP->getSourceElementType();
7971 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7972 return Alloca->getAllocatedType();
7973
7974 llvm_unreachable("Unhandled Instruction type");
7975 return nullptr;
7976}
7977// This function returns a struct that has at most two members.
7978// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7979// descriptor. The second member, if needed, is a struct containing arrays
7980// that need to be passed to the offloaded target kernel. For example,
7981// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7982// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7983// respectively, then the types created by this function are
7984//
7985// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7986// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7987// %struct.privates }
7988// %struct.task_with_privates is returned by this function.
7989// If there aren't any offloading arrays to pass to the target kernel,
7990// %struct.kmp_task_ompbuilder_t is returned.
7991static StructType *
7992createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7993 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7994
7995 if (OffloadingArraysToPrivatize.empty())
7996 return OMPIRBuilder.Task;
7997
7998 SmallVector<Type *, 4> StructFieldTypes;
7999 for (Value *V : OffloadingArraysToPrivatize) {
8000 assert(V->getType()->isPointerTy() &&
8001 "Expected pointer to array to privatize. Got a non-pointer value "
8002 "instead");
8003 Type *ArrayTy = getOffloadingArrayType(V);
8004 assert(ArrayTy && "ArrayType cannot be nullptr");
8005 StructFieldTypes.push_back(ArrayTy);
8006 }
8007 StructType *PrivatesStructTy =
8008 StructType::create(StructFieldTypes, "struct.privates");
8009 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8010 "struct.task_with_privates");
8011}
8012static Error emitTargetOutlinedFunction(
8013 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8014 TargetRegionEntryInfo &EntryInfo,
8015 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8016 Function *&OutlinedFn, Constant *&OutlinedFnID,
8017 SmallVectorImpl<Value *> &Inputs,
8018 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8019 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8020
8021 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8022 [&](StringRef EntryFnName) {
8023 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8024 EntryFnName, Inputs, CBFunc,
8025 ArgAccessorFuncCB);
8026 };
8027
8028 return OMPBuilder.emitTargetRegionFunction(
8029 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8030 OutlinedFnID);
8031}
8032
8033OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
8034 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8035 OpenMPIRBuilder::InsertPointTy AllocaIP,
8036 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8037 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8038
8039 // The following explains the code-gen scenario for the `target` directive. A
8040 // similar scenario is followed for other device-related directives (e.g.
8041 // `target enter data`), since we only need to emit a task that encapsulates
8042 // the proper runtime call.
8043 //
8044 // When we arrive at this function, the target region itself has been
8045 // outlined into the function OutlinedFn.
8046 // So at this point, for
8047 // --------------------------------------------------------------
8048 // void user_code_that_offloads(...) {
8049 // omp target depend(..) map(from:a) map(to:b) private(i)
8050 // do i = 1, 10
8051 // a(i) = b(i) + n
8052 // }
8053 //
8054 // --------------------------------------------------------------
8055 //
8056 // we have
8057 //
8058 // --------------------------------------------------------------
8059 //
8060 // void user_code_that_offloads(...) {
8061 // %.offload_baseptrs = alloca [2 x ptr], align 8
8062 // %.offload_ptrs = alloca [2 x ptr], align 8
8063 // %.offload_mappers = alloca [2 x ptr], align 8
8064 // ;; target region has been outlined and now we need to
8065 // ;; offload to it via a target task.
8066 // }
8067 // void outlined_device_function(ptr a, ptr b, ptr n) {
8068 // n = *n_ptr;
8069 // do i = 1, 10
8070 // a(i) = b(i) + n
8071 // }
8072 //
8073 // We have to now do the following
8074 // (i) Make an offloading call to outlined_device_function using the OpenMP
8075 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8076 // emitted by emitKernelLaunch
8077 // (ii) Create a task entry point function that calls kernel_launch_function
8078 // and is the entry point for the target task. See
8079 // '@.omp_target_task_proxy_func in the pseudocode below.
8080 // (iii) Create a task with the task entry point created in (ii)
8081 //
8082 // That is we create the following
8083 // struct task_with_privates {
8084 // struct kmp_task_ompbuilder_t task_struct;
8085 // struct privates {
8086 // [2 x ptr] ; baseptrs
8087 // [2 x ptr] ; ptrs
8088 // [2 x i64] ; sizes
8089 // }
8090 // }
8091 // void user_code_that_offloads(...) {
8092 // %.offload_baseptrs = alloca [2 x ptr], align 8
8093 // %.offload_ptrs = alloca [2 x ptr], align 8
8094 // %.offload_sizes = alloca [2 x i64], align 8
8095 //
8096 // %structArg = alloca { ptr, ptr, ptr }, align 8
8097 // %strucArg[0] = a
8098 // %strucArg[1] = b
8099 // %strucArg[2] = &n
8100 //
8101 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8102 // sizeof(kmp_task_ompbuilder_t),
8103 // sizeof(structArg),
8104 // @.omp_target_task_proxy_func,
8105 // ...)
8106 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8107 // sizeof(structArg))
8108 // memcpy(target_task_with_privates->privates->baseptrs,
8109 // offload_baseptrs, sizeof(offload_baseptrs)
8110 // memcpy(target_task_with_privates->privates->ptrs,
8111 // offload_ptrs, sizeof(offload_ptrs)
8112 // memcpy(target_task_with_privates->privates->sizes,
8113 // offload_sizes, sizeof(offload_sizes)
8114 // dependencies_array = ...
8115 // ;; if nowait not present
8116 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8117 // call @__kmpc_omp_task_begin_if0(...)
8118 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8119 // %target_task_with_privates)
8120 // call @__kmpc_omp_task_complete_if0(...)
8121 // }
8122 //
8123 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8124 // ptr %task) {
8125 // %structArg = alloca {ptr, ptr, ptr}
8126 // %task_ptr = getelementptr(%task, 0, 0)
8127 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8128 // mempcy(%structArg, %shared_data, sizeof(%structArg))
8129 //
8130 // %offloading_arrays = getelementptr(%task, 0, 1)
8131 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8132 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8133 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8134 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8135 // %offload_sizes, %structArg)
8136 // }
8137 //
8138 // We need the proxy function because the signature of the task entry point
8139 // expected by kmpc_omp_task is always the same and will be different from
8140 // that of the kernel_launch function.
8141 //
8142 // kernel_launch_function is generated by emitKernelLaunch and has the
8143 // always_inline attribute. For this example, it'll look like so:
8144 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8145 // %offload_sizes, %structArg) alwaysinline {
8146 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8147 // ; load aggregated data from %structArg
8148 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8149 // ; offload_sizes
8150 // call i32 @__tgt_target_kernel(...,
8151 // outlined_device_function,
8152 // ptr %kernel_args)
8153 // }
8154 // void outlined_device_function(ptr a, ptr b, ptr n) {
8155 // n = *n_ptr;
8156 // do i = 1, 10
8157 // a(i) = b(i) + n
8158 // }
8159 //
8160 BasicBlock *TargetTaskBodyBB =
8161 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8162 BasicBlock *TargetTaskAllocaBB =
8163 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8164
8165 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8166 TargetTaskAllocaBB->begin());
8167 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8168
8169 OutlineInfo OI;
8170 OI.EntryBB = TargetTaskAllocaBB;
8171 OI.OuterAllocaBB = AllocaIP.getBlock();
8172
8173 // Add the thread ID argument.
8174 SmallVector<Instruction *, 4> ToBeDeleted;
8175 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8176 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8177
8178 // Generate the task body which will subsequently be outlined.
8179 Builder.restoreIP(TargetTaskBodyIP);
8180 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8181 return Err;
8182
8183 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8184 // it is given. These blocks are enumerated by
8185 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8186 // to be outside the region. In other words, OI.ExitBlock is expected to be
8187 // the start of the region after the outlining. We used to set OI.ExitBlock
8188 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8189 // except when the task body is a single basic block. In that case,
8190 // OI.ExitBlock is set to the single task body block and will get left out of
8191 // the outlining process. So, simply create a new empty block to which we
8192 // unconditionally branch from where TaskBodyCB left off.
8193 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8194 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8195 /*IsFinished=*/true);
8196
8197 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8198 bool NeedsTargetTask = HasNoWait && DeviceID;
8199 if (NeedsTargetTask) {
8200 for (auto *V :
8201 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8202 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8203 RTArgs.SizesArray}) {
8204 if (V) {
8205 OffloadingArraysToPrivatize.push_back(V);
8206 OI.ExcludeArgsFromAggregate.push_back(V);
8207 }
8208 }
8209 }
8210 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8211 DeviceID, OffloadingArraysToPrivatize](
8212 Function &OutlinedFn) mutable {
8213 assert(OutlinedFn.hasOneUse() &&
8214 "there must be a single user for the outlined function");
8215
8216 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8217
8218 // The first argument of StaleCI is always the thread id.
8219 // The next few arguments are the pointers to offloading arrays
8220 // if any. (see OffloadingArraysToPrivatize)
8221 // Finally, all other local values that are live into the outlined region
8222 // end up in a structure whose pointer is passed as the last argument. This
8223 // piece of data is passed in the "shared" field of the task structure. So,
8224 // we know we have to pass shareds to the task if the number of arguments is
8225 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8226 // thread id. Further, for safety, we assert that the number of arguments of
8227 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
8228 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8229 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8230 assert((!HasShareds ||
8231 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8232 "Wrong number of arguments for StaleCI when shareds are present");
8233 int SharedArgOperandNo =
8234 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8235
8236 StructType *TaskWithPrivatesTy =
8237 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8238 StructType *PrivatesTy = nullptr;
8239
8240 if (!OffloadingArraysToPrivatize.empty())
8241 PrivatesTy =
8242 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8243
8244 Function *ProxyFn = emitTargetTaskProxyFunction(
8245 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8246 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8247
8248 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8249 << "\n");
8250
8251 Builder.SetInsertPoint(StaleCI);
8252
8253 // Gather the arguments for emitting the runtime call.
8254 uint32_t SrcLocStrSize;
8255 Constant *SrcLocStr =
8256 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8258
8259 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8260 //
8261 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8262 // the DeviceID to the deferred task, and also because
8263 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8264 Function *TaskAllocFn =
8265 !NeedsTargetTask
8266 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8267 : getOrCreateRuntimeFunctionPtr(
8268 OMPRTL___kmpc_omp_target_task_alloc);
8269
8270 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8271 // call.
8272 Value *ThreadID = getOrCreateThreadID(Ident);
8273
8274 // Argument - `sizeof_kmp_task_t` (TaskSize)
8275 // Tasksize refers to the size in bytes of kmp_task_t data structure
8276 // plus any other data to be passed to the target task, if any, which
8277 // is packed into a struct. kmp_task_t and the struct so created are
8278 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8279 Value *TaskSize = Builder.getInt64(
8280 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8281
8282 // Argument - `sizeof_shareds` (SharedsSize)
8283 // SharedsSize refers to the shareds array size in the kmp_task_t data
8284 // structure.
8285 Value *SharedsSize = Builder.getInt64(0);
8286 if (HasShareds) {
8287 auto *ArgStructAlloca =
8288 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8289 assert(ArgStructAlloca &&
8290 "Unable to find the alloca instruction corresponding to arguments "
8291 "for extracted function");
8292 auto *ArgStructType =
8293 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8294 assert(ArgStructType && "Unable to find struct type corresponding to "
8295 "arguments for extracted function");
8296 SharedsSize =
8297 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8298 }
8299
8300 // Argument - `flags`
8301 // Task is tied iff (Flags & 1) == 1.
8302 // Task is untied iff (Flags & 1) == 0.
8303 // Task is final iff (Flags & 2) == 2.
8304 // Task is not final iff (Flags & 2) == 0.
8305 // A target task is not final and is untied.
8306 Value *Flags = Builder.getInt32(0);
8307
8308 // Emit the @__kmpc_omp_task_alloc runtime call
8309 // The runtime call returns a pointer to an area where the task captured
8310 // variables must be copied before the task is run (TaskData)
8311 CallInst *TaskData = nullptr;
8312
8313 SmallVector<llvm::Value *> TaskAllocArgs = {
8314 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8315 /*flags=*/Flags,
8316 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8317 /*task_func=*/ProxyFn};
8318
8319 if (NeedsTargetTask) {
8320 assert(DeviceID && "Expected non-empty device ID.");
8321 TaskAllocArgs.push_back(DeviceID);
8322 }
8323
8324 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
8325
8326 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8327 if (HasShareds) {
8328 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8329 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8330 *this, Builder, TaskData, TaskWithPrivatesTy);
8331 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8332 SharedsSize);
8333 }
8334 if (!OffloadingArraysToPrivatize.empty()) {
8335 Value *Privates =
8336 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8337 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8338 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8339 [[maybe_unused]] Type *ArrayType =
8340 getOffloadingArrayType(PtrToPrivatize);
8341 assert(ArrayType && "ArrayType cannot be nullptr");
8342
8343 Type *ElementType = PrivatesTy->getElementType(i);
8344 assert(ElementType == ArrayType &&
8345 "ElementType should match ArrayType");
8346 (void)ArrayType;
8347
8348 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8349 Builder.CreateMemCpy(
8350 Dst, Alignment, PtrToPrivatize, Alignment,
8351 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8352 }
8353 }
8354
8355 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8356
8357 // ---------------------------------------------------------------
8358 // V5.2 13.8 target construct
8359 // If the nowait clause is present, execution of the target task
8360 // may be deferred. If the nowait clause is not present, the target task is
8361 // an included task.
8362 // ---------------------------------------------------------------
8363 // The above means that the lack of a nowait on the target construct
8364 // translates to '#pragma omp task if(0)'
8365 if (!NeedsTargetTask) {
8366 if (DepArray) {
8367 Function *TaskWaitFn =
8368 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8369 createRuntimeFunctionCall(
8370 TaskWaitFn,
8371 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8372 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8373 /*dep_list=*/DepArray,
8374 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8375 /*noalias_dep_list=*/
8376 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8377 }
8378 // Included task.
8379 Function *TaskBeginFn =
8380 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8381 Function *TaskCompleteFn =
8382 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8383 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8384 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
8385 CI->setDebugLoc(StaleCI->getDebugLoc());
8386 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8387 } else if (DepArray) {
8388 // HasNoWait - meaning the task may be deferred. Call
8389 // __kmpc_omp_task_with_deps if there are dependencies,
8390 // else call __kmpc_omp_task
8391 Function *TaskFn =
8392 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8393 createRuntimeFunctionCall(
8394 TaskFn,
8395 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8396 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8397 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8398 } else {
8399 // Emit the @__kmpc_omp_task runtime call to spawn the task
8400 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8401 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
8402 }
8403
8404 StaleCI->eraseFromParent();
8405 for (Instruction *I : llvm::reverse(ToBeDeleted))
8406 I->eraseFromParent();
8407 };
8408 addOutlineInfo(std::move(OI));
8409
8410 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8411 << *(Builder.GetInsertBlock()) << "\n");
8412 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8413 << *(Builder.GetInsertBlock()->getParent()->getParent())
8414 << "\n");
8415 return Builder.saveIP();
8416}
8417
8418Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8419 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8420 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8421 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8422 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8423 if (Error Err =
8424 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8425 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8426 return Err;
8427 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8428 return Error::success();
8429}
8430
8431static void emitTargetCall(
8432 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8433 OpenMPIRBuilder::InsertPointTy AllocaIP,
8434 OpenMPIRBuilder::TargetDataInfo &Info,
8435 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8436 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8437 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8439 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8440 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8442 bool HasNoWait, Value *DynCGroupMem,
8443 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8444 // Generate a function call to the host fallback implementation of the target
8445 // region. This is called by the host when no offload entry was generated for
8446 // the target region and when the offloading call fails at runtime.
8447 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8448 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8449 Builder.restoreIP(IP);
8450 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
8451 return Builder.saveIP();
8452 };
8453
8454 bool HasDependencies = Dependencies.size() > 0;
8455 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8456
8457 OpenMPIRBuilder::TargetKernelArgs KArgs;
8458
8459 auto TaskBodyCB =
8460 [&](Value *DeviceID, Value *RTLoc,
8461 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8462 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8463 // produce any.
8464 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8465 // emitKernelLaunch makes the necessary runtime call to offload the
8466 // kernel. We then outline all that code into a separate function
8467 // ('kernel_launch_function' in the pseudo code above). This function is
8468 // then called by the target task proxy function (see
8469 // '@.omp_target_task_proxy_func' in the pseudo code above)
8470 // "@.omp_target_task_proxy_func' is generated by
8471 // emitTargetTaskProxyFunction.
8472 if (OutlinedFnID && DeviceID)
8473 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8474 EmitTargetCallFallbackCB, KArgs,
8475 DeviceID, RTLoc, TargetTaskAllocaIP);
8476
8477 // We only need to do the outlining if `DeviceID` is set, to avoid calling
8478 // `emitKernelLaunch` when we want to code-gen for the host, e.g. when
8479 // generating the `else` branch of an `if` clause.
8480 //
8481 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8482 // In this case, we execute the host implementation directly.
8483 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8484 }());
8485
8486 OMPBuilder.Builder.restoreIP(AfterIP);
8487 return Error::success();
8488 };
8489
8490 auto &&EmitTargetCallElse =
8491 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8492 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8493 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8494 // produce any.
8495 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8496 if (RequiresOuterTargetTask) {
8497 // Arguments that are intended to be directly forwarded to an
8498 // emitKernelLaunch call are passed as nullptr, since
8499 // OutlinedFnID=nullptr results in that call not being done.
8500 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8501 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8502 /*RTLoc=*/nullptr, AllocaIP,
8503 Dependencies, EmptyRTArgs, HasNoWait);
8504 }
8505 return EmitTargetCallFallbackCB(Builder.saveIP());
8506 }());
8507
8508 Builder.restoreIP(AfterIP);
8509 return Error::success();
8510 };
8511
8512 auto &&EmitTargetCallThen =
8513 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8514 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8515 Info.HasNoWait = HasNoWait;
8516 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8517 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8518 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8519 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8520 /*IsNonContiguous=*/true,
8521 /*ForEndCall=*/false))
8522 return Err;
8523
8524 SmallVector<Value *, 3> NumTeamsC;
8525 for (auto [DefaultVal, RuntimeVal] :
8526 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8527 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8528 : Builder.getInt32(DefaultVal));
8529
8530 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8531 // the minimum of the optional THREAD_LIMIT and NUM_THREADS clauses.
8532 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8533 if (Clause)
8534 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8535 /*isSigned=*/false);
8536 return Clause;
8537 };
8538 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8539 if (Clause)
8540 Result =
8541 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8542 Result, Clause)
8543 : Clause;
8544 };
8545
8546 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8547 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8548 SmallVector<Value *, 3> NumThreadsC;
8549 Value *MaxThreadsClause =
8550 RuntimeAttrs.TeamsThreadLimit.size() == 1
8551 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8552 : nullptr;
8553
8554 for (auto [TeamsVal, TargetVal] : zip_equal(
8555 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8556 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8557 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8558
8559 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8560 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8561
8562 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8563 }
8564
8565 unsigned NumTargetItems = Info.NumberOfPtrs;
8566 // TODO: Use correct device ID
8567 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8568 uint32_t SrcLocStrSize;
8569 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8570 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8571 llvm::omp::IdentFlag(0), 0);
8572
8573 Value *TripCount = RuntimeAttrs.LoopTripCount
8574 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8575 Builder.getInt64Ty(),
8576 /*isSigned=*/false)
8577 : Builder.getInt64(0);
8578
8579 // Request zero groupprivate bytes by default.
8580 if (!DynCGroupMem)
8581 DynCGroupMem = Builder.getInt32(0);
8582
8583 KArgs = OpenMPIRBuilder::TargetKernelArgs(
8584 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
8585 HasNoWait, DynCGroupMemFallback);
8586
8587 // Assume no error was returned because TaskBodyCB and
8588 // EmitTargetCallFallbackCB don't produce any.
8589 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8590 // The presence of certain clauses on the target directive requires the
8591 // explicit generation of the target task.
8592 if (RequiresOuterTargetTask)
8593 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8594 Dependencies, KArgs.RTArgs,
8595 Info.HasNoWait);
8596
8597 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8598 EmitTargetCallFallbackCB, KArgs,
8599 DeviceID, RTLoc, AllocaIP);
8600 }());
8601
8602 Builder.restoreIP(AfterIP);
8603 return Error::success();
8604 };
8605
8606 // If we don't have an ID for the target region, it means an offload entry
8607 // wasn't created. In this case we just run the host fallback directly and
8608 // ignore any potential 'if' clauses.
8609 if (!OutlinedFnID) {
8610 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8611 return;
8612 }
8613
8614 // If there's no 'if' clause, only generate the kernel launch code path.
8615 if (!IfCond) {
8616 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8617 return;
8618 }
8619
8620 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8621 EmitTargetCallElse, AllocaIP));
8622}
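// Summary of the dispatch above: with no OutlinedFnID only the host fallback
// runs; with an ID but no IfCond only the kernel-launch path is emitted; with
// both, emitIfClause guards the launch ("then") against the host fallback
// ("else"). Nowait/depend clauses additionally wrap either path in a target
// task via emitTargetTask.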
8623
8624OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8625 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8626 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8627 TargetRegionEntryInfo &EntryInfo,
8628 const TargetKernelDefaultAttrs &DefaultAttrs,
8629 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8630 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8631 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8632 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8633 CustomMapperCallbackTy CustomMapperCB,
8634 const SmallVector<DependData> &Dependencies, bool HasNowait,
8635 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8636
8637 if (!updateToLocation(Loc))
8638 return InsertPointTy();
8639
8640 Builder.restoreIP(CodeGenIP);
8641
8642 Function *OutlinedFn;
8643 Constant *OutlinedFnID = nullptr;
8644 // The target region is outlined into its own function. The LLVM IR for
8645 // the target region itself is generated using the callbacks CBFunc
8646 // and ArgAccessorFuncCB.
8647 if (Error Err = emitTargetOutlinedFunction(
8648 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8649 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8650 return Err;
8651
8652 // If we are not on the target device, then we need to generate code
8653 // to make a remote call (offload) to the previously outlined function
8654 // that represents the target region. Do that now.
8655 if (!Config.isTargetDevice())
8656 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8657 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8658 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
8659 DynCGroupMemFallback);
8660 return Builder.saveIP();
8661}
8662
8663std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8664 StringRef FirstSeparator,
8665 StringRef Separator) {
8666 SmallString<128> Buffer;
8667 llvm::raw_svector_ostream OS(Buffer);
8668 StringRef Sep = FirstSeparator;
8669 for (StringRef Part : Parts) {
8670 OS << Sep << Part;
8671 Sep = Separator;
8672 }
8673 return OS.str().str();
8674}
8675
8676std::string
8677OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8678 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8679 Config.separator());
8680}
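// A worked example of the two name helpers above (separator values are
// illustrative): getNameWithSeparators({"a", "b", "c"}, "_", ".") prints the
// first separator before the first part and the second before every later
// part, yielding "_a.b.c". createPlatformSpecificName feeds the separators
// configured for the current platform into the same routine.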
8681
8682GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
8683 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
8684 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8685 if (Elem.second) {
8686 assert(Elem.second->getValueType() == Ty &&
8687 "OMP internal variable has different type than requested");
8688 } else {
8689 // TODO: investigate the appropriate linkage type used for the global
8690 // variable for possibly changing that to internal or private, or maybe
8691 // create different versions of the function for different OMP internal
8692 // variables.
8693 const DataLayout &DL = M.getDataLayout();
8694 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
8695 // default global AS is 1.
8696 // See double-target-call-with-declare-target.f90 and
8697 // declare-target-vars-in-target-region.f90 libomptarget
8698 // tests.
8699 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
8700 : M.getTargetTriple().isAMDGPU()
8701 ? 0
8702 : DL.getDefaultGlobalsAddressSpace();
8703 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8704 ? GlobalValue::ExternalLinkage
8705 : GlobalValue::CommonLinkage;
8706 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8707 Constant::getNullValue(Ty), Elem.first(),
8708 /*InsertBefore=*/nullptr,
8709 GlobalValue::NotThreadLocal, AddressSpaceVal);
8710 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8711 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
8712 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8713 Elem.second = GV;
8714 }
8715
8716 return Elem.second;
8717}
8718
8719Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8720 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8721 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8722 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8723}
8724
8725Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8726 LLVMContext &Ctx = Builder.getContext();
8727 Value *Null =
8728 Constant::getNullValue(PointerType::getUnqual(Ctx));
8729 Value *SizeGep =
8730 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8731 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8732 return SizePtrToInt;
8733}
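// The function above uses the classic "size via GEP" idiom: indexing element
// 1 off a null base measures the stride of the given type, and ptrtoint
// materializes that as an i64. Illustrative IR (type T taken from BasePtr):
//   %gep = getelementptr T, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64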
8734 
8735GlobalVariable *
8736OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8737 std::string VarName) {
8738 llvm::Constant *MaptypesArrayInit =
8739 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8740 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8741 M, MaptypesArrayInit->getType(),
8742 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8743 VarName);
8744 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8745 return MaptypesArrayGlobal;
8746}
8747
8748void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8749 InsertPointTy AllocaIP,
8750 unsigned NumOperands,
8751 struct MapperAllocas &MapperAllocas) {
8752 if (!updateToLocation(Loc))
8753 return;
8754
8755 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8756 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8757 Builder.restoreIP(AllocaIP);
8758 AllocaInst *ArgsBase = Builder.CreateAlloca(
8759 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8760 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8761 ".offload_ptrs");
8762 AllocaInst *ArgSizes = Builder.CreateAlloca(
8763 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8764 updateToLocation(Loc);
8765 MapperAllocas.ArgsBase = ArgsBase;
8766 MapperAllocas.Args = Args;
8767 MapperAllocas.ArgSizes = ArgSizes;
8768}
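// For NumOperands == 2, the allocas created above look roughly like this in
// the entry block (a sketch; alignment and address space are target-defined):
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs = alloca [2 x ptr]
//   %.offload_sizes = alloca [2 x i64]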
8769
8770void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8771 Function *MapperFunc, Value *SrcLocInfo,
8772 Value *MaptypesArg, Value *MapnamesArg,
8773 struct MapperAllocas &MapperAllocas,
8774 int64_t DeviceID, unsigned NumOperands) {
8775 if (!updateToLocation(Loc))
8776 return;
8777
8778 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8779 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8780 Value *ArgsBaseGEP =
8781 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8782 {Builder.getInt32(0), Builder.getInt32(0)});
8783 Value *ArgsGEP =
8784 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8785 {Builder.getInt32(0), Builder.getInt32(0)});
8786 Value *ArgSizesGEP =
8787 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8788 {Builder.getInt32(0), Builder.getInt32(0)});
8789 Value *NullPtr =
8790 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8791 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
8792 Builder.getInt32(NumOperands),
8793 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
8794 MaptypesArg, MapnamesArg, NullPtr});
8795}
8796
8797void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8798 TargetDataRTArgs &RTArgs,
8799 TargetDataInfo &Info,
8800 bool ForEndCall) {
8801 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8802 "expected region end call to runtime only when end call is separate");
8803 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8804 auto VoidPtrTy = UnqualPtrTy;
8805 auto VoidPtrPtrTy = UnqualPtrTy;
8806 auto Int64Ty = Type::getInt64Ty(M.getContext());
8807 auto Int64PtrTy = UnqualPtrTy;
8808
8809 if (!Info.NumberOfPtrs) {
8810 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8811 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8812 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8813 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8814 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8815 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8816 return;
8817 }
8818
8819 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8820 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8821 Info.RTArgs.BasePointersArray,
8822 /*Idx0=*/0, /*Idx1=*/0);
8823 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8824 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8825 /*Idx0=*/0,
8826 /*Idx1=*/0);
8827 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8828 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8829 /*Idx0=*/0, /*Idx1=*/0);
8830 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8831 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8832 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8833 : Info.RTArgs.MapTypesArray,
8834 /*Idx0=*/0,
8835 /*Idx1=*/0);
8836
8837 // Only emit the mapper information arrays if debug information is
8838 // requested.
8839 if (!Info.EmitDebug)
8840 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8841 else
8842 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8843 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8844 /*Idx0=*/0,
8845 /*Idx1=*/0);
8846 // If there is no user-defined mapper, set the mapper array to nullptr to
8847 // avoid an unnecessary data privatization.
8848 if (!Info.HasMapper)
8849 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8850 else
8851 RTArgs.MappersArray =
8852 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8853}
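// Each argument filled in above is a pointer decayed to element 0 of its
// array, e.g. for the base pointers (a sketch, with N == Info.NumberOfPtrs):
//   getelementptr inbounds [N x ptr], ptr %.offload_baseptrs, i32 0, i32 0
// With NumberOfPtrs == 0, every argument degenerates to a null pointer
// instead, as handled at the top of the function.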
8854
8855void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8856 InsertPointTy CodeGenIP,
8857 MapInfosTy &CombinedInfo,
8858 TargetDataInfo &Info) {
8859 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8860 CombinedInfo.NonContigInfo;
8861
8862 // Build an array of struct descriptor_dim and then assign it to
8863 // offload_args.
8864 //
8865 // struct descriptor_dim {
8866 // uint64_t offset;
8867 // uint64_t count;
8868 // uint64_t stride
8869 // };
8870 Type *Int64Ty = Builder.getInt64Ty();
8871 StructType *DimTy = StructType::create(
8872 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8873 "struct.descriptor_dim");
8874
8875 enum { OffsetFD = 0, CountFD, StrideFD };
8876 // We need two index variables here: "Dims" has one entry per component,
8877 // whereas the offset, count, and stride arrays have one entry per
8878 // non-contiguous base declaration.
8879 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8880 // Skip emitting IR if the dimension size is 1, since it cannot be
8881 // non-contiguous.
8882 if (NonContigInfo.Dims[I] == 1)
8883 continue;
8884 Builder.restoreIP(AllocaIP);
8885 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8886 AllocaInst *DimsAddr =
8887 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8888 Builder.restoreIP(CodeGenIP);
8889 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8890 unsigned RevIdx = EE - II - 1;
8891 Value *DimsLVal = Builder.CreateInBoundsGEP(
8892 DimsAddr->getAllocatedType(), DimsAddr,
8893 {Builder.getInt64(0), Builder.getInt64(II)});
8894 // Offset
8895 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8896 Builder.CreateAlignedStore(
8897 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8898 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8899 // Count
8900 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8901 Builder.CreateAlignedStore(
8902 NonContigInfo.Counts[L][RevIdx], CountLVal,
8903 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8904 // Stride
8905 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8906 Builder.CreateAlignedStore(
8907 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8908 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8909 }
8910 // args[I] = &dims
8911 Builder.restoreIP(CodeGenIP);
8912 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8913 DimsAddr, Builder.getPtrTy());
8914 Value *P = Builder.CreateConstInBoundsGEP2_32(
8915 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8916 Info.RTArgs.PointersArray, 0, I);
8917 Builder.CreateAlignedStore(
8918 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8919 ++L;
8920 }
8921}
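// Sketch of the output for one non-contiguous entry with Dims[I] == 2:
//   %dims = alloca [2 x %struct.descriptor_dim]   ; at AllocaIP
// followed by stores of {offset, count, stride} for each dimension (written
// in reverse dimension order) and a store of %dims into slot I of the
// .offload_ptrs array.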
8922
8923void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8924 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8925 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8926 BasicBlock *ExitBB, bool IsInit) {
8927 StringRef Prefix = IsInit ? ".init" : ".del";
8928
8929 // Evaluate if this is an array section.
8930 BasicBlock *BodyBB = BasicBlock::Create(
8931 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8932 Value *IsArray =
8933 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8934 Value *DeleteBit = Builder.CreateAnd(
8935 MapType,
8936 Builder.getInt64(
8937 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8938 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8939 Value *DeleteCond;
8940 Value *Cond;
8941 if (IsInit) {
8942 // base != begin?
8943 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8944 // IsPtrAndObj?
8945 Value *PtrAndObjBit = Builder.CreateAnd(
8946 MapType,
8947 Builder.getInt64(
8948 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8949 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8950 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8951 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8952 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8953 DeleteCond = Builder.CreateIsNull(
8954 DeleteBit,
8955 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8956 } else {
8957 Cond = IsArray;
8958 DeleteCond = Builder.CreateIsNotNull(
8959 DeleteBit,
8960 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8961 }
8962 Cond = Builder.CreateAnd(Cond, DeleteCond);
8963 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8964
8965 emitBlock(BodyBB, MapperFn);
8966 // Get the array size by multiplying the element size by the number of
8967 // elements (i.e., \p Size).
8968 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8969 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that the entry
8970 // serves memory allocation/deletion purposes only.
8971 Value *MapTypeArg = Builder.CreateAnd(
8972 MapType,
8973 Builder.getInt64(
8974 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8975 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8976 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8977 MapTypeArg = Builder.CreateOr(
8978 MapTypeArg,
8979 Builder.getInt64(
8980 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8981 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8982
8983 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8984 // data structure.
8985 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8986 ArraySize, MapTypeArg, MapName};
8987 createRuntimeFunctionCall(
8988 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8989 OffloadingArgs);
8990}
8991
8992Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8993 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8994 llvm::Value *BeginArg)>
8995 GenMapInfoCB,
8996 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8997 SmallVector<Type *> Params;
8998 Params.emplace_back(Builder.getPtrTy());
8999 Params.emplace_back(Builder.getPtrTy());
9000 Params.emplace_back(Builder.getPtrTy());
9001 Params.emplace_back(Builder.getInt64Ty());
9002 Params.emplace_back(Builder.getInt64Ty());
9003 Params.emplace_back(Builder.getPtrTy());
9004
9005 auto *FnTy =
9006 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9007
9008 SmallString<64> TyStr;
9009 raw_svector_ostream Out(TyStr);
9010 Function *MapperFn =
9011 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
9012 MapperFn->addFnAttr(Attribute::NoInline);
9013 MapperFn->addFnAttr(Attribute::NoUnwind);
9014 MapperFn->addParamAttr(0, Attribute::NoUndef);
9015 MapperFn->addParamAttr(1, Attribute::NoUndef);
9016 MapperFn->addParamAttr(2, Attribute::NoUndef);
9017 MapperFn->addParamAttr(3, Attribute::NoUndef);
9018 MapperFn->addParamAttr(4, Attribute::NoUndef);
9019 MapperFn->addParamAttr(5, Attribute::NoUndef);
9020
9021 // Start the mapper function code generation.
9022 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9023 auto SavedIP = Builder.saveIP();
9024 Builder.SetInsertPoint(EntryBB);
9025
9026 Value *MapperHandle = MapperFn->getArg(0);
9027 Value *BaseIn = MapperFn->getArg(1);
9028 Value *BeginIn = MapperFn->getArg(2);
9029 Value *Size = MapperFn->getArg(3);
9030 Value *MapType = MapperFn->getArg(4);
9031 Value *MapName = MapperFn->getArg(5);
9032
9033 // Compute the starting and end addresses of array elements.
9034 // Prepare common arguments for array initialization and deletion.
9035 // Convert the size in bytes into the number of array elements.
9036 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9037 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9038 Value *PtrBegin = BeginIn;
9039 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9040
9041 // Emit array initialization if this is an array section and \p MapType indicates
9042 // that memory allocation is required.
9043 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9044 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9045 MapType, MapName, ElementSize, HeadBB,
9046 /*IsInit=*/true);
9047
9048 // Emit a for loop that iterates over \p Size elements and maps each of them.
9049
9050 // Emit the loop header block.
9051 emitBlock(HeadBB, MapperFn);
9052 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9053 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9054 // Evaluate whether the initial condition is satisfied.
9055 Value *IsEmpty =
9056 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9057 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9058
9059 // Emit the loop body block.
9060 emitBlock(BodyBB, MapperFn);
9061 BasicBlock *LastBB = BodyBB;
9062 PHINode *PtrPHI =
9063 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9064 PtrPHI->addIncoming(PtrBegin, HeadBB);
9065
9066 // Get map clause information. Fill up the arrays with all mapped variables.
9067 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9068 if (!Info)
9069 return Info.takeError();
9070
9071 // Call the runtime API __tgt_mapper_num_components to get the number of
9072 // pre-existing components.
9073 Value *OffloadingArgs[] = {MapperHandle};
9074 Value *PreviousSize = createRuntimeFunctionCall(
9075 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9076 OffloadingArgs);
9077 Value *ShiftedPreviousSize =
9078 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9079
9080 // Fill up the runtime mapper handle for all components.
9081 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9082 Value *CurBaseArg = Info->BasePointers[I];
9083 Value *CurBeginArg = Info->Pointers[I];
9084 Value *CurSizeArg = Info->Sizes[I];
9085 Value *CurNameArg = Info->Names.size()
9086 ? Info->Names[I]
9087 : Constant::getNullValue(Builder.getPtrTy());
9088
9089 // Extract the MEMBER_OF field from the map type.
9090 Value *OriMapType = Builder.getInt64(
9091 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9092 Info->Types[I]));
9093 Value *MemberMapType =
9094 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9095
9096 // Combine the map type inherited from user-defined mapper with that
9097 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9098 // bits of the \a MapType, which is the input argument of the mapper
9099 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9100 // bits of MemberMapType.
9101 // [OpenMP 5.0], 1.2.6. map-type decay.
9102 // | alloc | to | from | tofrom | release | delete
9103 // ----------------------------------------------------------
9104 // alloc | alloc | alloc | alloc | alloc | release | delete
9105 // to | alloc | to | alloc | to | release | delete
9106 // from | alloc | alloc | from | from | release | delete
9107 // tofrom | alloc | to | from | tofrom | release | delete
9108 Value *LeftToFrom = Builder.CreateAnd(
9109 MapType,
9110 Builder.getInt64(
9111 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9112 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9113 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9114 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9115 BasicBlock *AllocElseBB =
9116 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9117 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9118 BasicBlock *ToElseBB =
9119 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9120 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9121 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9122 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9123 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9124 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9125 emitBlock(AllocBB, MapperFn);
9126 Value *AllocMapType = Builder.CreateAnd(
9127 MemberMapType,
9128 Builder.getInt64(
9129 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9130 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9131 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9132 Builder.CreateBr(EndBB);
9133 emitBlock(AllocElseBB, MapperFn);
9134 Value *IsTo = Builder.CreateICmpEQ(
9135 LeftToFrom,
9136 Builder.getInt64(
9137 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9138 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9139 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9140 // In case of to, clear OMP_MAP_FROM.
9141 emitBlock(ToBB, MapperFn);
9142 Value *ToMapType = Builder.CreateAnd(
9143 MemberMapType,
9144 Builder.getInt64(
9145 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9146 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9147 Builder.CreateBr(EndBB);
9148 emitBlock(ToElseBB, MapperFn);
9149 Value *IsFrom = Builder.CreateICmpEQ(
9150 LeftToFrom,
9151 Builder.getInt64(
9152 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9153 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9154 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9155 // In case of from, clear OMP_MAP_TO.
9156 emitBlock(FromBB, MapperFn);
9157 Value *FromMapType = Builder.CreateAnd(
9158 MemberMapType,
9159 Builder.getInt64(
9160 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9161 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9162 // In case of tofrom, do nothing.
9163 emitBlock(EndBB, MapperFn);
9164 LastBB = EndBB;
9165 PHINode *CurMapType =
9166 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9167 CurMapType->addIncoming(AllocMapType, AllocBB);
9168 CurMapType->addIncoming(ToMapType, ToBB);
9169 CurMapType->addIncoming(FromMapType, FromBB);
9170 CurMapType->addIncoming(MemberMapType, ToElseBB);
9171
9172 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9173 CurSizeArg, CurMapType, CurNameArg};
9174
9175 auto ChildMapperFn = CustomMapperCB(I);
9176 if (!ChildMapperFn)
9177 return ChildMapperFn.takeError();
9178 if (*ChildMapperFn) {
9179 // Call the corresponding mapper function.
9180 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9181 ->setDoesNotThrow();
9182 } else {
9183 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9184 // data structure.
9185 createRuntimeFunctionCall(
9186 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9187 OffloadingArgs);
9188 }
9189 }
9190
9191 // Update the pointer to point to the next element that needs to be mapped,
9192 // and check whether we have mapped all elements.
9193 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9194 "omp.arraymap.next");
9195 PtrPHI->addIncoming(PtrNext, LastBB);
9196 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9197 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9198 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9199
9200 emitBlock(ExitBB, MapperFn);
9201 // Emit array deletion if this is an array section and \p MapType indicates
9202 // that deletion is required.
9203 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9204 MapType, MapName, ElementSize, DoneBB,
9205 /*IsInit=*/false);
9206
9207 // Emit the function exit block.
9208 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9209
9210 Builder.CreateRetVoid();
9211 Builder.restoreIP(SavedIP);
9212 return MapperFn;
9213}
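// Shape of the generated mapper function (a sketch of the code built above):
//   void <FuncName>(ptr %handle, ptr %base, ptr %begin, i64 %size,
//                   i64 %type, ptr %name) {
//     <init prologue via emitUDMapperArrayInitOrDel>;
//     loop over [%begin, %end), calling __tgt_push_mapper_component (or a
//     nested custom mapper) once per element and map-clause component;
//     <deletion epilogue>; ret void }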
9214
9215Error OpenMPIRBuilder::emitOffloadingArrays(
9216 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9217 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9218 bool IsNonContiguous,
9219 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9220
9221 // Reset the array information.
9222 Info.clearArrayInfo();
9223 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9224
9225 if (Info.NumberOfPtrs == 0)
9226 return Error::success();
9227
9228 Builder.restoreIP(AllocaIP);
9229 // Detect whether any captured size requires runtime evaluation; if none
9230 // does, a constant array can eventually be used for the sizes.
9231 ArrayType *PointerArrayType =
9232 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9233
9234 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9235 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9236
9237 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9238 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9239 AllocaInst *MappersArray = Builder.CreateAlloca(
9240 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9241 Info.RTArgs.MappersArray = MappersArray;
9242
9243 // If we don't have any VLA types or other types that require runtime
9244 // evaluation, we can use a constant array for the map sizes, otherwise we
9245 // need to fill up the arrays as we do for the pointers.
9246 Type *Int64Ty = Builder.getInt64Ty();
9247 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9248 ConstantInt::get(Int64Ty, 0));
9249 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9250 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9251 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9252 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9253 if (IsNonContiguous &&
9254 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9255 CombinedInfo.Types[I] &
9256 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9257 ConstSizes[I] =
9258 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9259 else
9260 ConstSizes[I] = CI;
9261 continue;
9262 }
9263 }
9264 RuntimeSizes.set(I);
9265 }
9266
9267 if (RuntimeSizes.all()) {
9268 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9269 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9270 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9271 restoreIPandDebugLoc(Builder, CodeGenIP);
9272 } else {
9273 auto *SizesArrayInit = ConstantArray::get(
9274 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9275 std::string Name = createPlatformSpecificName({"offload_sizes"});
9276 auto *SizesArrayGbl =
9277 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9278 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9279 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9280
9281 if (!RuntimeSizes.any()) {
9282 Info.RTArgs.SizesArray = SizesArrayGbl;
9283 } else {
9284 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9285 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9286 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9287 AllocaInst *Buffer = Builder.CreateAlloca(
9288 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9289 Buffer->setAlignment(OffloadSizeAlign);
9290 restoreIPandDebugLoc(Builder, CodeGenIP);
9291 Builder.CreateMemCpy(
9292 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9293 SizesArrayGbl, OffloadSizeAlign,
9294 Builder.getIntN(
9295 IndexSize,
9296 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9297
9298 Info.RTArgs.SizesArray = Buffer;
9299 }
9300 restoreIPandDebugLoc(Builder, CodeGenIP);
9301 }
9302
9303 // The map types are always constant so we don't need to generate code to
9304 // fill arrays. Instead, we create an array constant.
9305 SmallVector<uint64_t, 4> Mapping;
9306 for (auto mapFlag : CombinedInfo.Types)
9307 Mapping.push_back(
9308 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9309 mapFlag));
9310 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9311 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9312 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9313
9314 // The information types are only built if provided.
9315 if (!CombinedInfo.Names.empty()) {
9316 auto *MapNamesArrayGbl = createOffloadMapnames(
9317 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9318 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9319 Info.EmitDebug = true;
9320 } else {
9321 Info.RTArgs.MapNamesArray =
9322 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9323 Info.EmitDebug = false;
9324 }
9325
9326 // If there's a present map type modifier, it must not be applied to the end
9327 // of a region, so generate a separate map type array in that case.
9328 if (Info.separateBeginEndCalls()) {
9329 bool EndMapTypesDiffer = false;
9330 for (uint64_t &Type : Mapping) {
9331 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9332 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9333 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9334 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9335 EndMapTypesDiffer = true;
9336 }
9337 }
9338 if (EndMapTypesDiffer) {
9339 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9340 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9341 }
9342 }
9343
9344 PointerType *PtrTy = Builder.getPtrTy();
9345 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9346 Value *BPVal = CombinedInfo.BasePointers[I];
9347 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9348 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9349 0, I);
9350 Builder.CreateAlignedStore(BPVal, BP,
9351 M.getDataLayout().getPrefTypeAlign(PtrTy));
9352
9353 if (Info.requiresDevicePointerInfo()) {
9354 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9355 CodeGenIP = Builder.saveIP();
9356 Builder.restoreIP(AllocaIP);
9357 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9358 Builder.restoreIP(CodeGenIP);
9359 if (DeviceAddrCB)
9360 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9361 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9362 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9363 if (DeviceAddrCB)
9364 DeviceAddrCB(I, BP);
9365 }
9366 }
9367
9368 Value *PVal = CombinedInfo.Pointers[I];
9369 Value *P = Builder.CreateConstInBoundsGEP2_32(
9370 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9371 I);
9372 // TODO: Check that the alignment is correct.
9373 Builder.CreateAlignedStore(PVal, P,
9374 M.getDataLayout().getPrefTypeAlign(PtrTy));
9375
9376 if (RuntimeSizes.test(I)) {
9377 Value *S = Builder.CreateConstInBoundsGEP2_32(
9378 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9379 /*Idx0=*/0,
9380 /*Idx1=*/I);
9381 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9382 Int64Ty,
9383 /*isSigned=*/true),
9384 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9385 }
9386 // Fill up the mapper array.
9387 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9388 Value *MFunc = ConstantPointerNull::get(PtrTy);
9389
9390 auto CustomMFunc = CustomMapperCB(I);
9391 if (!CustomMFunc)
9392 return CustomMFunc.takeError();
9393 if (*CustomMFunc)
9394 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9395
9396 Value *MAddr = Builder.CreateInBoundsGEP(
9397 MappersArray->getAllocatedType(), MappersArray,
9398 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9399 Builder.CreateAlignedStore(
9400 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9401 }
9402
9403 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9404 Info.NumberOfPtrs == 0)
9405 return Error::success();
9406 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9407 return Error::success();
9408}
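// Net effect of the function above for, e.g., two mapped pointers with
// constant sizes (a sketch): entry-block allocas %.offload_baseptrs,
// %.offload_ptrs, and %.offload_mappers of type [2 x ptr], a private constant
// global @.offload_sizes of type [2 x i64], a constant @.offload_maptypes
// global holding the map-type flag words, and per-entry stores filling the
// allocas at the code-gen point.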
9409
9410void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9411 BasicBlock *CurBB = Builder.GetInsertBlock();
9412
9413 if (!CurBB || CurBB->getTerminator()) {
9414 // If there is no insert point or the previous block is already
9415 // terminated, don't touch it.
9416 } else {
9417 // Otherwise, create a fall-through branch.
9418 Builder.CreateBr(Target);
9419 }
9420
9421 Builder.ClearInsertionPoint();
9422}
9423
9424void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9425 bool IsFinished) {
9426 BasicBlock *CurBB = Builder.GetInsertBlock();
9427
9428 // Fall out of the current block (if necessary).
9429 emitBranch(BB);
9430
9431 if (IsFinished && BB->use_empty()) {
9432 BB->eraseFromParent();
9433 return;
9434 }
9435
9436 // Place the block after the current block, if possible, or else at
9437 // the end of the function.
9438 if (CurBB && CurBB->getParent())
9439 CurFn->insert(std::next(CurBB->getIterator()), BB);
9440 else
9441 CurFn->insert(CurFn->end(), BB);
9442 Builder.SetInsertPoint(BB);
9443}
9444
9445Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9446 BodyGenCallbackTy ElseGen,
9447 InsertPointTy AllocaIP) {
9448 // If the condition constant folds and can be elided, try to avoid emitting
9449 // the condition and the dead arm of the if/else.
9450 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9451 auto CondConstant = CI->getSExtValue();
9452 if (CondConstant)
9453 return ThenGen(AllocaIP, Builder.saveIP());
9454
9455 return ElseGen(AllocaIP, Builder.saveIP());
9456 }
9457
9458 Function *CurFn = Builder.GetInsertBlock()->getParent();
9459
9460 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9461 // emit the conditional branch.
9462 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9463 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9464 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9465 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9466 // Emit the 'then' code.
9467 emitBlock(ThenBlock, CurFn);
9468 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9469 return Err;
9470 emitBranch(ContBlock);
9471 // Emit the 'else' code if present.
9472 // There is no need to emit line number for unconditional branch.
9473 emitBlock(ElseBlock, CurFn);
9474 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9475 return Err;
9476 // There is no need to emit line number for unconditional branch.
9477 emitBranch(ContBlock);
9478 // Emit the continuation block for code after the if.
9479 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9480 return Error::success();
9481}
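// For a non-constant condition, the function above emits the usual diamond:
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms branching to %omp_if.end and ThenGen/ElseGen filling them
// in. A constant condition skips the dead arm entirely, as handled at the
// top of the function.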
9482
9483bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9484 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9485 assert(!(AO == AtomicOrdering::NotAtomic ||
9486 AO == llvm::AtomicOrdering::Unordered) &&
9487 "Unexpected Atomic Ordering.");
9488
9489 bool Flush = false;
9490 llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
9491
9492 switch (AK) {
9493 case Read:
9494 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9495 AO == AtomicOrdering::SequentiallyConsistent) {
9496 FlushAO = AtomicOrdering::Acquire;
9497 Flush = true;
9498 }
9499 break;
9500 case Write:
9501 case Compare:
9502 case Update:
9503 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9504 AO == AtomicOrdering::SequentiallyConsistent) {
9505 FlushAO = AtomicOrdering::Release;
9506 Flush = true;
9507 }
9508 break;
9509 case Capture:
9510 switch (AO) {
9511 case AtomicOrdering::Acquire:
9512 FlushAO = AtomicOrdering::Acquire;
9513 Flush = true;
9514 break;
9515 case AtomicOrdering::Release:
9516 FlushAO = AtomicOrdering::Release;
9517 Flush = true;
9518 break;
9519 case AtomicOrdering::AcquireRelease:
9520 case AtomicOrdering::SequentiallyConsistent:
9521 FlushAO = AtomicOrdering::AcquireRelease;
9522 Flush = true;
9523 break;
9524 default:
9525 // do nothing - leave silently.
9526 break;
9527 }
9528 }
9529
9530 if (Flush) {
9531 // The flush runtime call does not yet take a memory ordering, so until
9532 // that changes we resolve which atomic ordering to use here but issue
9533 // the flush call regardless of it.
9534 // TODO: pass `FlushAO` after memory ordering support is added
9535 (void)FlushAO;
9536 emitFlush(Loc);
9537 }
9538
9539 // For AO == AtomicOrdering::Monotonic and all other combinations,
9540 // do nothing.
9541 return Flush;
9542}
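// Two examples of the mapping above: an atomic read with acquire (or
// stronger) ordering is followed by a flush, as is an atomic write with
// release (or stronger) ordering; monotonic accesses never emit one.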
9543
9544OpenMPIRBuilder::InsertPointTy
9545OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9546 AtomicOpValue &X, AtomicOpValue &V,
9547 AtomicOrdering AO, InsertPointTy AllocaIP) {
9548 if (!updateToLocation(Loc))
9549 return Loc.IP;
9550
9551 assert(X.Var->getType()->isPointerTy() &&
9552 "OMP Atomic expects a pointer to target memory");
9553 Type *XElemTy = X.ElemTy;
9554 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9555 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9556 "OMP atomic read expected a scalar type");
9557
9558 Value *XRead = nullptr;
9559
9560 if (XElemTy->isIntegerTy()) {
9561 LoadInst *XLD =
9562 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9563 XLD->setAtomic(AO);
9564 XRead = cast<Value>(XLD);
9565 } else if (XElemTy->isStructTy()) {
9566 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9567 // target does not support `atomicrmw` of the size of the struct
9568 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9569 OldVal->setAtomic(AO);
9570 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9571 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9572 OpenMPIRBuilder::AtomicInfo atomicInfo(
9573 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9574 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9575 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9576 XRead = AtomicLoadRes.first;
9577 OldVal->eraseFromParent();
9578 } else {
9579 // We need to perform the atomic op as an integer.
9580 IntegerType *IntCastTy =
9581 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9582 LoadInst *XLoad =
9583 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9584 XLoad->setAtomic(AO);
9585 if (XElemTy->isFloatingPointTy()) {
9586 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9587 } else {
9588 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9589 }
9590 }
9591 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9592 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9593 return Builder.saveIP();
9594}
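// For an integer X, the path above reduces to one atomic load plus a plain
// store into V, roughly (ordering taken from AO):
//   %omp.atomic.read = load atomic i32, ptr %x monotonic, align 4
//   store i32 %omp.atomic.read, ptr %v
// Floating-point values are loaded as the same-width integer and bitcast
// back; structs go through the atomic libcall path instead.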
9595
9596OpenMPIRBuilder::InsertPointTy
9597OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9598 AtomicOpValue &X, Value *Expr,
9599 AtomicOrdering AO, InsertPointTy AllocaIP) {
9600 if (!updateToLocation(Loc))
9601 return Loc.IP;
9602
9603 assert(X.Var->getType()->isPointerTy() &&
9604 "OMP Atomic expects a pointer to target memory");
9605 Type *XElemTy = X.ElemTy;
9606 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9607 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9608 "OMP atomic write expected a scalar type");
9609
9610 if (XElemTy->isIntegerTy()) {
9611 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9612 XSt->setAtomic(AO);
9613 } else if (XElemTy->isStructTy()) {
9614 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9615 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9616 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9617 OpenMPIRBuilder::AtomicInfo atomicInfo(
9618 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9619 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9620 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9621 OldVal->eraseFromParent();
9622 } else {
9623 // We need to bitcast and perform the atomic op as integers.
9624 IntegerType *IntCastTy =
9625 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9626 Value *ExprCast =
9627 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9628 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9629 XSt->setAtomic(AO);
9630 }
9631
9632 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9633 return Builder.saveIP();
9634}
9635
9636OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9637 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9638 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9639 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9640 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9641 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9642 if (!updateToLocation(Loc))
9643 return Loc.IP;
9644
9645 LLVM_DEBUG({
9646 Type *XTy = X.Var->getType();
9647 assert(XTy->isPointerTy() &&
9648 "OMP Atomic expects a pointer to target memory");
9649 Type *XElemTy = X.ElemTy;
9650 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9651 XElemTy->isPointerTy()) &&
9652 "OMP atomic update expected a scalar type");
9653 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9654 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9655 "OpenMP atomic does not support LT or GT operations");
9656 });
9657
9658 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9659 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9660 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9661 if (!AtomicResult)
9662 return AtomicResult.takeError();
9663 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9664 return Builder.saveIP();
9665}
9666
9667// FIXME: Duplicating AtomicExpand
9668Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9669 AtomicRMWInst::BinOp RMWOp) {
9670 switch (RMWOp) {
9671 case AtomicRMWInst::Add:
9672 return Builder.CreateAdd(Src1, Src2);
9673 case AtomicRMWInst::Sub:
9674 return Builder.CreateSub(Src1, Src2);
9675 case AtomicRMWInst::And:
9676 return Builder.CreateAnd(Src1, Src2);
9677 case AtomicRMWInst::Nand:
9678 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9679 case AtomicRMWInst::Or:
9680 return Builder.CreateOr(Src1, Src2);
9681 case AtomicRMWInst::Xor:
9682 return Builder.CreateXor(Src1, Src2);
9683 case AtomicRMWInst::Xchg:
9684 case AtomicRMWInst::FAdd:
9685 case AtomicRMWInst::FSub:
9686 case AtomicRMWInst::BAD_BINOP:
9687 case AtomicRMWInst::Max:
9688 case AtomicRMWInst::Min:
9689 case AtomicRMWInst::UMax:
9690 case AtomicRMWInst::UMin:
9691 case AtomicRMWInst::FMax:
9692 case AtomicRMWInst::FMin:
9693 case AtomicRMWInst::FMaximum:
9694 case AtomicRMWInst::FMinimum:
9695 case AtomicRMWInst::UIncWrap:
9696 case AtomicRMWInst::UDecWrap:
9697 case AtomicRMWInst::USubCond:
9698 case AtomicRMWInst::USubSat:
9699 llvm_unreachable("Unsupported atomic update operation");
9700 }
9701 llvm_unreachable("Unsupported atomic update operation");
9702}
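// The helper above recomputes the value an atomicrmw leaves in memory, since
// the instruction itself returns the *old* value. E.g. for Add, the caller
// pairs
//   %old = atomicrmw add ptr %x, i32 %expr monotonic
// with a plain add of %old and %expr to materialize the new value needed by
// capture clauses.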
9703
9704Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9705 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9706 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9707 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9708 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9709 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9710 // or a complex datatype.
9711 bool emitRMWOp = false;
9712 switch (RMWOp) {
9713 case AtomicRMWInst::Add:
9714 case AtomicRMWInst::And:
9715 case AtomicRMWInst::Nand:
9716 case AtomicRMWInst::Or:
9717 case AtomicRMWInst::Xor:
9718 case AtomicRMWInst::Xchg:
9719 emitRMWOp = XElemTy;
9720 break;
9721 case AtomicRMWInst::Sub:
9722 emitRMWOp = (IsXBinopExpr && XElemTy);
9723 break;
9724 default:
9725 emitRMWOp = false;
9726 }
9727 emitRMWOp &= XElemTy->isIntegerTy();
9728
9729 std::pair<Value *, Value *> Res;
9730 if (emitRMWOp) {
9731 AtomicRMWInst *RMWInst =
9732 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9733 if (T.isAMDGPU()) {
9734 if (IsIgnoreDenormalMode)
9735 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9736 llvm::MDNode::get(Builder.getContext(), {}));
9737 if (!IsFineGrainedMemory)
9738 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9739 llvm::MDNode::get(Builder.getContext(), {}));
9740 if (!IsRemoteMemory)
9741 RMWInst->setMetadata("amdgpu.no.remote.memory",
9742 llvm::MDNode::get(Builder.getContext(), {}));
9743 }
9744 Res.first = RMWInst;
9745 // Not needed except for postfix captures. Generated anyway for
9746 // consistency with the else part; any DCE pass will remove it.
9747 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9748 if (RMWOp == AtomicRMWInst::Xchg)
9749 Res.second = Res.first;
9750 else
9751 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9752 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9753 XElemTy->isStructTy()) {
9754 LoadInst *OldVal =
9755 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9756 OldVal->setAtomic(AO);
9757 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9758 unsigned LoadSize =
9759 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9760
9761 OpenMPIRBuilder::AtomicInfo atomicInfo(
9762 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9763 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9764 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9765 BasicBlock *CurBB = Builder.GetInsertBlock();
9766 Instruction *CurBBTI = CurBB->getTerminator();
9767 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9768 BasicBlock *ExitBB =
9769 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9770 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9771 X->getName() + ".atomic.cont");
9772 ContBB->getTerminator()->eraseFromParent();
9773 Builder.restoreIP(AllocaIP);
9774 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9775 NewAtomicAddr->setName(X->getName() + "x.new.val");
9776 Builder.SetInsertPoint(ContBB);
9777 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9778 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9779 Value *OldExprVal = PHI;
9780 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9781 if (!CBResult)
9782 return CBResult.takeError();
9783 Value *Upd = *CBResult;
9784 Builder.CreateStore(Upd, NewAtomicAddr);
9785 AtomicOrdering Failure =
9786 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9787 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9788 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9789 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9790 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9791 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9792 OldVal->eraseFromParent();
9793 Res.first = OldExprVal;
9794 Res.second = Upd;
9795
9796 if (UnreachableInst *ExitTI =
9797 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9798 CurBBTI->eraseFromParent();
9799 Builder.SetInsertPoint(ExitBB);
9800 } else {
9801 Builder.SetInsertPoint(ExitTI);
9802 }
9803 } else {
9804 IntegerType *IntCastTy =
9805 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9806 LoadInst *OldVal =
9807 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9808 OldVal->setAtomic(AO);
9809 // CurBB
9810 // | /---\
9811 // ContBB |
9812 // | \---/
9813 // ExitBB
9814 BasicBlock *CurBB = Builder.GetInsertBlock();
9815 Instruction *CurBBTI = CurBB->getTerminator();
9816 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9817 BasicBlock *ExitBB =
9818 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9819 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9820 X->getName() + ".atomic.cont");
9821 ContBB->getTerminator()->eraseFromParent();
9822 Builder.restoreIP(AllocaIP);
9823 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9824 NewAtomicAddr->setName(X->getName() + "x.new.val");
9825 Builder.SetInsertPoint(ContBB);
9826 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9827 PHI->addIncoming(OldVal, CurBB);
9828 bool IsIntTy = XElemTy->isIntegerTy();
9829 Value *OldExprVal = PHI;
9830 if (!IsIntTy) {
9831 if (XElemTy->isFloatingPointTy()) {
9832 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9833 X->getName() + ".atomic.fltCast");
9834 } else {
9835 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9836 X->getName() + ".atomic.ptrCast");
9837 }
9838 }
9839
9840 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9841 if (!CBResult)
9842 return CBResult.takeError();
9843 Value *Upd = *CBResult;
9844 Builder.CreateStore(Upd, NewAtomicAddr);
9845 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9846 AtomicOrdering Failure =
9847 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9848 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9849 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9850 Result->setVolatile(VolatileX);
9851 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9852 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9853 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9854 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9855
9856 Res.first = OldExprVal;
9857 Res.second = Upd;
9858
9859 // Set the insertion point in the exit block.
9860 if (UnreachableInst *ExitTI =
9861 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9862 CurBBTI->eraseFromParent();
9863 Builder.SetInsertPoint(ExitBB);
9864 } else {
9865 Builder.SetInsertPoint(ExitTI);
9866 }
9867 }
9868
9869 return Res;
9870}
9871
9872OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9873 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9874 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9875 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9876 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9877 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9878 if (!updateToLocation(Loc))
9879 return Loc.IP;
9880
9881 LLVM_DEBUG({
9882 Type *XTy = X.Var->getType();
9883 assert(XTy->isPointerTy() &&
9884 "OMP Atomic expects a pointer to target memory");
9885 Type *XElemTy = X.ElemTy;
9886 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9887 XElemTy->isPointerTy()) &&
9888 "OMP atomic capture expected a scalar type");
9889 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9890 "OpenMP atomic does not support LT or GT operations");
9891 });
9892
9893 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9894 // 'x' is simply atomically rewritten with 'expr'.
9895 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9896 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9897 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9898 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9899 if (!AtomicResult)
9900 return AtomicResult.takeError();
9901 Value *CapturedVal =
9902 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9903 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9904
9905 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9906 return Builder.saveIP();
9907}
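
// Illustrative mapping (clause forms assumed): for the postfix form
//   #pragma omp atomic capture
//   { v = x; x = x + expr; }
// IsPostfixUpdate is true and the old value of 'x' (AtomicResult->first) is
// stored to 'v'; for the prefix form { x = x + expr; v = x; } the updated
// value (AtomicResult->second) is stored instead.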
9908
9909OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9910 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9911 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9912 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9913 bool IsFailOnly) {
9914
9915 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9916 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9917 IsPostfixUpdate, IsFailOnly, Failure);
9918}
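
// Note: getStrongestFailureOrdering drops any release component, since a
// failed compare-exchange performs no store. For example: acq_rel maps to
// acquire, release maps to monotonic, and seq_cst stays seq_cst.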
9919
9920OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9921 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9922 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9923 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9924 bool IsFailOnly, AtomicOrdering Failure) {
9925
9926 if (!updateToLocation(Loc))
9927 return Loc.IP;
9928
9929 assert(X.Var->getType()->isPointerTy() &&
9930 "OMP atomic expects a pointer to target memory");
9931 // compare capture
9932 if (V.Var) {
9933 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9934 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9935 }
9936
9937 bool IsInteger = E->getType()->isIntegerTy();
9938
9939 if (Op == OMPAtomicCompareOp::EQ) {
9940 AtomicCmpXchgInst *Result = nullptr;
9941 if (!IsInteger) {
9942 IntegerType *IntCastTy =
9943 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9944 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9945 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9946 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9947 AO, Failure);
9948 } else {
9949 Result =
9950 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9951 }
9952
9953 if (V.Var) {
9954 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9955 if (!IsInteger)
9956 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9957 assert(OldValue->getType() == V.ElemTy &&
9958 "OldValue and V must be of same type");
9959 if (IsPostfixUpdate) {
9960 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9961 } else {
9962 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9963 if (IsFailOnly) {
9964 // CurBB----
9965 // | |
9966 // v |
9967 // ContBB |
9968 // | |
9969 // v |
9970 // ExitBB <-
9971 //
9972 // where ContBB only contains the store of old value to 'v'.
9973 BasicBlock *CurBB = Builder.GetInsertBlock();
9974 Instruction *CurBBTI = CurBB->getTerminator();
9975 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9976 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9977 CurBBTI, X.Var->getName() + ".atomic.exit");
9978 BasicBlock *ContBB = CurBB->splitBasicBlock(
9979 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9980 ContBB->getTerminator()->eraseFromParent();
9981 CurBB->getTerminator()->eraseFromParent();
9982
9983 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9984
9985 Builder.SetInsertPoint(ContBB);
9986 Builder.CreateStore(OldValue, V.Var);
9987 Builder.CreateBr(ExitBB);
9988
9989 if (UnreachableInst *ExitTI =
9990 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9991 CurBBTI->eraseFromParent();
9992 Builder.SetInsertPoint(ExitBB);
9993 } else {
9994 Builder.SetInsertPoint(ExitTI);
9995 }
9996 } else {
9997 Value *CapturedValue =
9998 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9999 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10000 }
10001 }
10002 }
10003 // The comparison result has to be stored.
10004 if (R.Var) {
10005 assert(R.Var->getType()->isPointerTy() &&
10006 "r.var must be of pointer type");
10007 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10008
10009 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10010 Value *ResultCast = R.IsSigned
10011 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10012 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10013 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10014 }
10015 } else {
10016 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10017 "Op should be either max or min at this point");
10018 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10019
10020 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10021 // Let's take max as example.
10022 // OpenMP form:
10023 // x = x > expr ? expr : x;
10024 // LLVM form:
10025 // *ptr = *ptr > val ? *ptr : val;
10026 // We need to transform to LLVM form.
10027 // x = x <= expr ? x : expr;
10028 AtomicRMWInst::BinOp NewOp;
10029 if (IsXBinopExpr) {
10030 if (IsInteger) {
10031 if (X.IsSigned)
10032 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10033 : AtomicRMWInst::Max;
10034 else
10035 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10036 : AtomicRMWInst::UMax;
10037 } else {
10038 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10039 : AtomicRMWInst::FMax;
10040 }
10041 } else {
10042 if (IsInteger) {
10043 if (X.IsSigned)
10044 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10045 : AtomicRMWInst::Min;
10046 else
10047 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10048 : AtomicRMWInst::UMin;
10049 } else {
10050 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10051 : AtomicRMWInst::FMin;
10052 }
10053 }
10054
10055 AtomicRMWInst *OldValue =
10056 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10057 if (V.Var) {
10058 Value *CapturedValue = nullptr;
10059 if (IsPostfixUpdate) {
10060 CapturedValue = OldValue;
10061 } else {
10062 CmpInst::Predicate Pred;
10063 switch (NewOp) {
10064 case AtomicRMWInst::Max:
10065 Pred = CmpInst::ICMP_SGT;
10066 break;
10067 case AtomicRMWInst::UMax:
10068 Pred = CmpInst::ICMP_UGT;
10069 break;
10070 case AtomicRMWInst::FMax:
10071 Pred = CmpInst::FCMP_OGT;
10072 break;
10073 case AtomicRMWInst::Min:
10074 Pred = CmpInst::ICMP_SLT;
10075 break;
10076 case AtomicRMWInst::UMin:
10077 Pred = CmpInst::ICMP_ULT;
10078 break;
10079 case AtomicRMWInst::FMin:
10080 Pred = CmpInst::FCMP_OLT;
10081 break;
10082 default:
10083 llvm_unreachable("unexpected comparison op");
10084 }
10085 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10086 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10087 }
10088 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10089 }
10090 }
10091
10092 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10093
10094 return Builder.saveIP();
10095}
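
// Worked example (clause form assumed): for a signed integer 'x' and
//   #pragma omp atomic compare
//   x = x > expr ? expr : x;
// the frontend passes Op = MAX with IsXBinopExpr = true, which the code
// above reverses to AtomicRMWInst::Min, so a single
//   atomicrmw min ptr %x, i32 %expr
// performs the conditional replacement without a cmpxchg loop.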
10096
10097OpenMPIRBuilder::InsertPointOrErrorTy
10098OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
10099 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10100 Value *NumTeamsUpper, Value *ThreadLimit,
10101 Value *IfExpr) {
10102 if (!updateToLocation(Loc))
10103 return InsertPointTy();
10104
10105 uint32_t SrcLocStrSize;
10106 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10107 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10108 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10109
10110 // The outer allocation basic block is the entry block of the current function.
10111 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10112 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10113 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10114 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10115 }
10116
10117 // The current basic block is split into four basic blocks. After outlining,
10118 // they will be mapped as follows:
10119 // ```
10120 // def current_fn() {
10121 // current_basic_block:
10122 // br label %teams.exit
10123 // teams.exit:
10124 // ; instructions after teams
10125 // }
10126 //
10127 // def outlined_fn() {
10128 // teams.alloca:
10129 // br label %teams.body
10130 // teams.body:
10131 // ; instructions within teams body
10132 // }
10133 // ```
10134 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10135 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10136 BasicBlock *AllocaBB =
10137 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10138
10139 bool SubClausesPresent =
10140 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10141 // Push num_teams
10142 if (!Config.isTargetDevice() && SubClausesPresent) {
10143 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10144 "if lowerbound is non-null, then upperbound must also be non-null "
10145 "for bounds on num_teams");
10146
10147 if (NumTeamsUpper == nullptr)
10148 NumTeamsUpper = Builder.getInt32(0);
10149
10150 if (NumTeamsLower == nullptr)
10151 NumTeamsLower = NumTeamsUpper;
10152
10153 if (IfExpr) {
10154 assert(IfExpr->getType()->isIntegerTy() &&
10155 "argument to if clause must be an integer value");
10156
10157 // upper = ifexpr ? upper : 1
10158 if (IfExpr->getType() != Int1)
10159 IfExpr = Builder.CreateICmpNE(IfExpr,
10160 ConstantInt::get(IfExpr->getType(), 0));
10161 NumTeamsUpper = Builder.CreateSelect(
10162 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10163
10164 // lower = ifexpr ? lower : 1
10165 NumTeamsLower = Builder.CreateSelect(
10166 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10167 }
10168
10169 if (ThreadLimit == nullptr)
10170 ThreadLimit = Builder.getInt32(0);
10171
10172 Value *ThreadNum = getOrCreateThreadID(Ident);
10173 createRuntimeFunctionCall(
10174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10175 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
10176 }
10177 // Generate the body of teams.
10178 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10179 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10180 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10181 return Err;
10182
10183 OutlineInfo OI;
10184 OI.EntryBB = AllocaBB;
10185 OI.ExitBB = ExitBB;
10186 OI.OuterAllocaBB = &OuterAllocaBB;
10187
10188 // Insert fake values for global tid and bound tid.
10189 SmallVector<Instruction *, 8> ToBeDeleted;
10190 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10191 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10192 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10193 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10194 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10195
10196 auto HostPostOutlineCB = [this, Ident,
10197 ToBeDeleted](Function &OutlinedFn) mutable {
10198 // The stale call instruction will be replaced with a new call instruction
10199 // for the runtime call that passes the outlined function.
10200
10201 assert(OutlinedFn.hasOneUse() &&
10202 "there must be a single user for the outlined function");
10203 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10204 ToBeDeleted.push_back(StaleCI);
10205
10206 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10207 "Outlined function must have two or three arguments only");
10208
10209 bool HasShared = OutlinedFn.arg_size() == 3;
10210
10211 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10212 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10213 if (HasShared)
10214 OutlinedFn.getArg(2)->setName("data");
10215
10216 // Call to the runtime function for teams in the current function.
10217 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10218 "outlined function.");
10219 Builder.SetInsertPoint(StaleCI);
10220 SmallVector<Value *> Args = {
10221 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10222 if (HasShared)
10223 Args.push_back(StaleCI->getArgOperand(2));
10224 createRuntimeFunctionCall(
10225 getOrCreateRuntimeFunctionPtr(
10226 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10227 Args);
10228
10229 for (Instruction *I : llvm::reverse(ToBeDeleted))
10230 I->eraseFromParent();
10231 };
10232
10233 if (!Config.isTargetDevice())
10234 OI.PostOutlineCB = HostPostOutlineCB;
10235
10236 addOutlineInfo(std::move(OI));
10237
10238 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10239
10240 return Builder.saveIP();
10241}
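
// For reference, a sketch of the host-side call emitted by the callback
// above (names invented): the stale call to the outlined function becomes
//   call void @__kmpc_fork_teams(ptr @ident, i32 1, ptr @outlined_fn,
//                                ptr %data)
// where the trailing aggregate pointer is only passed when the outlined
// function has the three-argument (shared data) form.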
10242
10243OpenMPIRBuilder::InsertPointOrErrorTy
10244OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10245 InsertPointTy OuterAllocaIP,
10246 BodyGenCallbackTy BodyGenCB) {
10247 if (!updateToLocation(Loc))
10248 return InsertPointTy();
10249
10250 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10251
10252 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10253 BasicBlock *BodyBB =
10254 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10255 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10256 }
10257 BasicBlock *ExitBB =
10258 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10259 BasicBlock *BodyBB =
10260 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10261 BasicBlock *AllocaBB =
10262 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10263
10264 // Generate the body of the distribute clause.
10265 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10266 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10267 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10268 return Err;
10269
10270 // When using target we use different runtime functions which require a
10271 // callback.
10272 if (Config.isTargetDevice()) {
10273 OutlineInfo OI;
10274 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10275 OI.EntryBB = AllocaBB;
10276 OI.ExitBB = ExitBB;
10277
10278 addOutlineInfo(std::move(OI));
10279 }
10280 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10281
10282 return Builder.saveIP();
10283}
10284
10285GlobalVariable *
10286OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10287 std::string VarName) {
10288 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10289 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10290 Names.size()),
10291 Names);
10292 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10293 M, MapNamesArrayInit->getType(),
10294 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10295 VarName);
10296 return MapNamesArrayGlobal;
10297}
10298
10299// Create all simple and struct types exposed by the runtime and remember
10300// the llvm::PointerTypes of them for easy access later.
10301void OpenMPIRBuilder::initializeTypes(Module &M) {
10302 LLVMContext &Ctx = M.getContext();
10303 StructType *T;
10304 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10305 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10306#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10307#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10308 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10309 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10310#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10311 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10312 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10313#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10314 T = StructType::getTypeByName(Ctx, StructName); \
10315 if (!T) \
10316 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10317 VarName = T; \
10318 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10319#include "llvm/Frontend/OpenMP/OMPKinds.def"
10320}
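
// As an illustration (the exact entries live in OMPKinds.def), a struct
// entry such as
//   OMP_STRUCT_TYPE(Ident, "struct.ident_t", false, Int32, Int32, ...)
// expands to: look up "struct.ident_t" in the context, create it if it does
// not exist yet, and cache both the struct type (Ident) and an opaque
// pointer type (IdentPtr) in the default target address space.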
10321
10322void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10323 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10324 SmallVectorImpl<BasicBlock *> &BlockVector) {
10325 SmallVector<BasicBlock *, 32> Worklist;
10326 BlockSet.insert(EntryBB);
10327 BlockSet.insert(ExitBB);
10328
10329 Worklist.push_back(EntryBB);
10330 while (!Worklist.empty()) {
10331 BasicBlock *BB = Worklist.pop_back_val();
10332 BlockVector.push_back(BB);
10333 for (BasicBlock *SuccBB : successors(BB))
10334 if (BlockSet.insert(SuccBB).second)
10335 Worklist.push_back(SuccBB);
10336 }
10337}
10338
10339void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10340 uint64_t Size, int32_t Flags,
10341 GlobalValue::LinkageTypes,
10342 StringRef Name) {
10343 if (!Config.isGPU()) {
10344 llvm::offloading::emitOffloadingEntry(
10345 M, object::OffloadKind::OFK_OpenMP, ID,
10346 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10347 return;
10348 }
10349 // TODO: Add support for global variables on the device after declare target
10350 // support.
10351 Function *Fn = dyn_cast<Function>(Addr);
10352 if (!Fn)
10353 return;
10354
10355 // Add a function attribute for the kernel.
10356 Fn->addFnAttr("kernel");
10357 if (T.isAMDGCN())
10358 Fn->addFnAttr("uniform-work-group-size", "true");
10359 Fn->addFnAttr(Attribute::MustProgress);
10360}
10361
10362// We only generate metadata for functions that contain target regions.
10363void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10364 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10365
10366 // If there are no entries, we don't need to do anything.
10367 if (OffloadInfoManager.empty())
10368 return;
10369
10370 LLVMContext &C = M.getContext();
10371 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10372 TargetRegionEntryInfo>,
10373 16>
10374 OrderedEntries(OffloadInfoManager.size());
10375
10376 // Auxiliary methods to create metadata values and strings.
10377 auto &&GetMDInt = [this](unsigned V) {
10378 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10379 };
10380
10381 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10382
10383 // Create the offloading info metadata node.
10384 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10385 auto &&TargetRegionMetadataEmitter =
10386 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10387 const TargetRegionEntryInfo &EntryInfo,
10388 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10389 // Generate metadata for target regions. Each entry of this metadata
10390 // contains:
10391 // - Entry 0 -> Kind of this type of metadata (0).
10392 // - Entry 1 -> Device ID of the file where the entry was identified.
10393 // - Entry 2 -> File ID of the file where the entry was identified.
10394 // - Entry 3 -> Mangled name of the function where the entry was
10395 // identified.
10396 // - Entry 4 -> Line in the file where the entry was identified.
10397 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10398 // - Entry 6 -> Order the entry was created.
10399 // The first element of the metadata node is the kind.
10400 Metadata *Ops[] = {
10401 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10402 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10403 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10404 GetMDInt(E.getOrder())};
10405
10406 // Save this entry in the right position of the ordered entries array.
10407 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10408
10409 // Add metadata to the named metadata node.
10410 MD->addOperand(MDNode::get(C, Ops));
10411 };
10412
10413 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10414
10415 // Create a function that emits metadata for each device global variable entry.
10416 auto &&DeviceGlobalVarMetadataEmitter =
10417 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10418 StringRef MangledName,
10419 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10420 // Generate metadata for global variables. Each entry of this metadata
10421 // contains:
10422 // - Entry 0 -> Kind of this type of metadata (1).
10423 // - Entry 1 -> Mangled name of the variable.
10424 // - Entry 2 -> Declare target kind.
10425 // - Entry 3 -> Order the entry was created.
10426 // The first element of the metadata node is the kind.
10427 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10428 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10429
10430 // Save this entry in the right position of the ordered entries array.
10431 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10432 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10433
10434 // Add metadata to the named metadata node.
10435 MD->addOperand(MDNode::get(C, Ops));
10436 };
10437
10438 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10439 DeviceGlobalVarMetadataEmitter);
10440
10441 for (const auto &E : OrderedEntries) {
10442 assert(E.first && "All ordered entries must exist!");
10443 if (const auto *CE =
10444 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10445 E.first)) {
10446 if (!CE->getID() || !CE->getAddress()) {
10447 // Do not blame the entry if the parent function is not emitted.
10448 TargetRegionEntryInfo EntryInfo = E.second;
10449 StringRef FnName = EntryInfo.ParentName;
10450 if (!M.getNamedValue(FnName))
10451 continue;
10452 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10453 continue;
10454 }
10455 createOffloadEntry(CE->getID(), CE->getAddress(),
10456 /*Size=*/0, CE->getFlags(),
10457 GlobalValue::WeakAnyLinkage);
10458 } else if (const auto *CE = dyn_cast<
10459 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10460 E.first)) {
10461 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10462 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10463 CE->getFlags());
10464 switch (Flags) {
10465 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10466 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10467 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10468 continue;
10469 if (!CE->getAddress()) {
10470 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10471 continue;
10472 }
10473 // The variable has no definition - no need to add the entry.
10474 if (CE->getVarSize() == 0)
10475 continue;
10476 break;
10477 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10478 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10479 (!Config.isTargetDevice() && CE->getAddress())) &&
10480 "Declaret target link address is set.");
10481 if (Config.isTargetDevice())
10482 continue;
10483 if (!CE->getAddress()) {
10484 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10485 continue;
10486 }
10487 break;
10488 default:
10489 break;
10490 }
10491
10492 // Hidden or internal symbols on the device are not externally visible.
10493 // We should not attempt to register them by creating an offloading
10494 // entry. Indirect variables are handled separately on the device.
10495 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10496 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10497 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10498 continue;
10499
10500 // Indirect globals need to use a special name that doesn't match the name
10501 // of the associated host global.
10502 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10503 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10504 Flags, CE->getLinkage(), CE->getVarName());
10505 else
10506 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10507 Flags, CE->getLinkage());
10508
10509 } else {
10510 llvm_unreachable("Unsupported entry kind.");
10511 }
10512 }
10513
10514 // Emit requires directive globals to a special entry so the runtime can
10515 // register them when the device image is loaded.
10516 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10517 // entries should be redesigned to better suit this use-case.
10518 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10519 offloading::emitOffloadingEntry(
10520 M, object::OffloadKind::OFK_OpenMP,
10521 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10522 ".requires", /*Size=*/0,
10523 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10524 Config.getRequiresFlags());
10525}
10526
10527void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10528 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10529 unsigned FileID, unsigned Line, unsigned Count) {
10530 raw_svector_ostream OS(Name);
10531 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10532 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10533 if (Count)
10534 OS << "_" << Count;
10535}
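
// Example (values invented, assuming the usual "__omp_offloading_" prefix):
// ParentName "foo", DeviceID 0x2b, FileID 0x1a2b, Line 42 and Count 0 yield
// "__omp_offloading_2b_1a2b_foo_l42"; a non-zero Count appends "_<count>"
// to disambiguate multiple regions on the same line.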
10536
10537void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10538 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10539 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10540 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10541 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10542 EntryInfo.Line, NewCount);
10543}
10544
10545TargetRegionEntryInfo
10546OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10547 vfs::FileSystem &VFS,
10548 StringRef ParentName) {
10549 sys::fs::UniqueID ID(0xdeadf17e, 0);
10550 auto FileIDInfo = CallBack();
10551 uint64_t FileID = 0;
10552 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10553 ID = Status->getUniqueID();
10554 FileID = Status->getUniqueID().getFile();
10555 } else {
10556 // If the inode ID could not be determined, create a hash value
10557 // of the current file name and use that as an ID.
10558 FileID = hash_value(std::get<0>(FileIDInfo));
10559 }
10560
10561 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10562 std::get<1>(FileIDInfo));
10563}
10564
10565unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10566 unsigned Offset = 0;
10567 for (uint64_t Remain =
10568 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10569 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10570 !(Remain & 1); Remain = Remain >> 1)
10571 Offset++;
10572 return Offset;
10573}
10574
10575omp::OpenMPOffloadMappingFlags
10576OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10577 // Rotate by getFlagMemberOffset() bits.
10578 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10579 << getFlagMemberOffset());
10580}
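
// Worked example (assuming OMP_MAP_MEMBER_OF masks the high 16 bits, i.e.
// 0xFFFF000000000000): getFlagMemberOffset() counts its trailing zero bits
// and returns 48, so getMemberOfFlag(/*Position=*/0) == 1ULL << 48, which
// encodes "member of the parent at index 0" in the MEMBER_OF field.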
10581
10582void OpenMPIRBuilder::setCorrectMemberOfFlag(
10583 omp::OpenMPOffloadMappingFlags &Flags,
10584 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10585 // If the entry is PTR_AND_OBJ but has not been marked with the special
10586 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10587 // marked as MEMBER_OF.
10588 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10589 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10590 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10591 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10592 llvm::to_underlying(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10593 return;
10594
10595 // Reset the placeholder value to prepare the flag for the assignment of the
10596 // proper MEMBER_OF value.
10597 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10598 Flags |= MemberOfFlag;
10599}
10600
10601Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10602 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10603 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10604 bool IsDeclaration, bool IsExternallyVisible,
10605 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10606 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10607 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10608 std::function<Constant *()> GlobalInitializer,
10609 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10610 // TODO: convert this to utilise the IRBuilder Config rather than
10611 // a passed down argument.
10612 if (OpenMPSIMD)
10613 return nullptr;
10614
10615 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10616 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10617 CaptureClause ==
10618 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10619 Config.hasRequiresUnifiedSharedMemory())) {
10620 SmallString<64> PtrName;
10621 {
10622 raw_svector_ostream OS(PtrName);
10623 OS << MangledName;
10624 if (!IsExternallyVisible)
10625 OS << format("_%x", EntryInfo.FileID);
10626 OS << "_decl_tgt_ref_ptr";
10627 }
10628
10629 Value *Ptr = M.getNamedValue(PtrName);
10630
10631 if (!Ptr) {
10632 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10633 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10634
10635 auto *GV = cast<GlobalVariable>(Ptr);
10636 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10637
10638 if (!Config.isTargetDevice()) {
10639 if (GlobalInitializer)
10640 GV->setInitializer(GlobalInitializer());
10641 else
10642 GV->setInitializer(GlobalValue);
10643 }
10644
10645 registerTargetGlobalVariable(
10646 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10647 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10648 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10649 }
10650
10651 return cast<Constant>(Ptr);
10652 }
10653
10654 return nullptr;
10655}
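
// Naming example (derived from the stream logic above): an internal
// variable "foo" in a file with FileID 0xabcd gets the reference pointer
// "foo_abcd_decl_tgt_ref_ptr", while an externally visible "foo" gets
// "foo_decl_tgt_ref_ptr".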
10656
10657void OpenMPIRBuilder::registerTargetGlobalVariable(
10658 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10659 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10660 bool IsDeclaration, bool IsExternallyVisible,
10661 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10662 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10663 std::vector<Triple> TargetTriple,
10664 std::function<Constant *()> GlobalInitializer,
10665 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10666 Constant *Addr) {
10667 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10668 (TargetTriple.empty() && !Config.isTargetDevice()))
10669 return;
10670
10671 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10672 StringRef VarName;
10673 int64_t VarSize;
10674 GlobalValue::LinkageTypes Linkage;
10675
10676 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10677 CaptureClause ==
10678 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10679 !Config.hasRequiresUnifiedSharedMemory()) {
10680 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10681 VarName = MangledName;
10682 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10683
10684 if (!IsDeclaration)
10685 VarSize = divideCeil(
10686 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10687 else
10688 VarSize = 0;
10689 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10690
10691 // This is a workaround carried over from Clang which prevents undesired
10692 // optimisation of internal variables.
10693 if (Config.isTargetDevice() &&
10694 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10695 // Do not create a "ref-variable" if the original is not also available
10696 // on the host.
10697 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10698 return;
10699
10700 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10701
10702 if (!M.getNamedValue(RefName)) {
10703 Constant *AddrRef =
10704 getOrCreateInternalVariable(Addr->getType(), RefName);
10705 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10706 GvAddrRef->setConstant(true);
10707 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10708 GvAddrRef->setInitializer(Addr);
10709 GeneratedRefs.push_back(GvAddrRef);
10710 }
10711 }
10712 } else {
10713 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10714 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10715 else
10716 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10717
10718 if (Config.isTargetDevice()) {
10719 VarName = (Addr) ? Addr->getName() : "";
10720 Addr = nullptr;
10721 } else {
10722 Addr = getAddrOfDeclareTargetVar(
10723 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10724 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10725 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10726 VarName = (Addr) ? Addr->getName() : "";
10727 }
10728 VarSize = M.getDataLayout().getPointerSize();
10729 Linkage = GlobalValue::WeakAnyLinkage;
10730 }
10731
10732 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10733 Flags, Linkage);
10734}
10735
10736/// Loads all the offload entries information from the host IR
10737/// metadata.
10738void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10739 // If we are in target mode, load the metadata from the host IR. This code has
10740 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10741
10742 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10743 if (!MD)
10744 return;
10745
10746 for (MDNode *MN : MD->operands()) {
10747 auto &&GetMDInt = [MN](unsigned Idx) {
10748 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10749 return cast<ConstantInt>(V->getValue())->getZExtValue();
10750 };
10751
10752 auto &&GetMDString = [MN](unsigned Idx) {
10753 auto *V = cast<MDString>(MN->getOperand(Idx));
10754 return V->getString();
10755 };
10756
10757 switch (GetMDInt(0)) {
10758 default:
10759 llvm_unreachable("Unexpected metadata!");
10760 break;
10761 case OffloadEntriesInfoManager::OffloadEntryInfo::
10762 OffloadingEntryInfoTargetRegion: {
10763 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10764 /*DeviceID=*/GetMDInt(1),
10765 /*FileID=*/GetMDInt(2),
10766 /*Line=*/GetMDInt(4),
10767 /*Count=*/GetMDInt(5));
10768 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10769 /*Order=*/GetMDInt(6));
10770 break;
10771 }
10772 case OffloadEntriesInfoManager::OffloadEntryInfo::
10773 OffloadingEntryInfoDeviceGlobalVar:
10774 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10775 /*MangledName=*/GetMDString(1),
10776 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10777 /*Flags=*/GetMDInt(2)),
10778 /*Order=*/GetMDInt(3));
10779 break;
10780 }
10781 }
10782}
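
// Illustrative host-IR metadata matching the layout documented in
// createOffloadEntriesAndInfoMetadata() (field values invented):
//   !omp_offload.info = !{!0, !1}
//   !0 = !{i32 0, i32 43, i32 6693, !"foo", i32 12, i32 0, i32 0}  ; target region
//   !1 = !{i32 1, !"global_var", i32 0, i32 1}                     ; declare target var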
10783
10784void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10785 StringRef HostFilePath) {
10786 if (HostFilePath.empty())
10787 return;
10788
10789 auto Buf = VFS.getBufferForFile(HostFilePath);
10790 if (std::error_code Err = Buf.getError()) {
10791 report_fatal_error(("error opening host file from host file path inside of "
10792 "OpenMPIRBuilder: " +
10793 Err.message())
10794 .c_str());
10795 }
10796
10797 LLVMContext Ctx;
10798 auto M = expectedToErrorOrAndEmitErrors(
10799 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10800 if (std::error_code Err = M.getError()) {
10801 report_fatal_error(
10802 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10803 .c_str());
10804 }
10805
10806 loadOffloadInfoMetadata(*M.get());
10807}
10808
10809//===----------------------------------------------------------------------===//
10810// OffloadEntriesInfoManager
10811//===----------------------------------------------------------------------===//
10812
10813bool OffloadEntriesInfoManager::empty() const {
10814 return OffloadEntriesTargetRegion.empty() &&
10815 OffloadEntriesDeviceGlobalVar.empty();
10816}
10817
10818unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10819 const TargetRegionEntryInfo &EntryInfo) const {
10820 auto It = OffloadEntriesTargetRegionCount.find(
10821 getTargetRegionEntryCountKey(EntryInfo));
10822 if (It == OffloadEntriesTargetRegionCount.end())
10823 return 0;
10824 return It->second;
10825}
10826
10827void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10828 const TargetRegionEntryInfo &EntryInfo) {
10829 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10830 EntryInfo.Count + 1;
10831}
10832
10833/// Initialize target region entry.
10834void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10835 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10836 OffloadEntriesTargetRegion[EntryInfo] =
10837 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10838 OMPTargetRegionEntryTargetRegion);
10839 ++OffloadingEntriesNum;
10840}
10841
10842void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10843 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10844 OMPTargetRegionEntryKind Flags) {
10845 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10846
10847 // Update the EntryInfo with the next available count for this location.
10848 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10849
10850 // If we are emitting code for a target, the entry is already initialized,
10851 // only has to be registered.
10852 if (OMPBuilder->Config.isTargetDevice()) {
10853 // This could happen if the device compilation is invoked standalone.
10854 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10855 return;
10856 }
10857 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10858 Entry.setAddress(Addr);
10859 Entry.setID(ID);
10860 Entry.setFlags(Flags);
10861 } else {
10862 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10863 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10864 return;
10865 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10866 "Target region entry already registered!");
10867 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10868 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10869 ++OffloadingEntriesNum;
10870 }
10871 incrementTargetRegionEntryInfoCount(EntryInfo);
10872}
10873
10874bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10875 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10876
10877 // Update the EntryInfo with the next available count for this location.
10878 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10879
10880 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10881 if (It == OffloadEntriesTargetRegion.end()) {
10882 return false;
10883 }
10884 // Fail if this entry is already registered.
10885 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10886 return false;
10887 return true;
10888}
10889
10890void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10891 const OffloadTargetRegionEntryInfoActTy &Action) {
10892 // Scan all target region entries and perform the provided action.
10893 for (const auto &It : OffloadEntriesTargetRegion) {
10894 Action(It.first, It.second);
10895 }
10896}
10897
10898void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10899 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10900 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10901 ++OffloadingEntriesNum;
10902}
10903
10904void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10905 StringRef VarName, Constant *Addr, int64_t VarSize,
10906 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10907 if (OMPBuilder->Config.isTargetDevice()) {
10908 // This could happen if the device compilation is invoked standalone.
10909 if (!hasDeviceGlobalVarEntryInfo(VarName))
10910 return;
10911 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10912 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10913 if (Entry.getVarSize() == 0) {
10914 Entry.setVarSize(VarSize);
10915 Entry.setLinkage(Linkage);
10916 }
10917 return;
10918 }
10919 Entry.setVarSize(VarSize);
10920 Entry.setLinkage(Linkage);
10921 Entry.setAddress(Addr);
10922 } else {
10923 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10924 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10925 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10926 "Entry not initialized!");
10927 if (Entry.getVarSize() == 0) {
10928 Entry.setVarSize(VarSize);
10929 Entry.setLinkage(Linkage);
10930 }
10931 return;
10932 }
10933 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10934 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10935 Addr, VarSize, Flags, Linkage,
10936 VarName.str());
10937 else
10938 OffloadEntriesDeviceGlobalVar.try_emplace(
10939 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10940 ++OffloadingEntriesNum;
10941 }
10942}
10943
10944void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10945 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10946 // Scan all target region entries and perform the provided action.
10947 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10948 Action(E.getKey(), E.getValue());
10949}
10950
10951//===----------------------------------------------------------------------===//
10952// CanonicalLoopInfo
10953//===----------------------------------------------------------------------===//
10954
10955void CanonicalLoopInfo::collectControlBlocks(
10956 SmallVectorImpl<BasicBlock *> &BBs) {
10957 // We only count those BBs as control blocks for which we do not need to
10958 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10959 // flow. For consistency, this also means we do not add the Body block, which
10960 // is just the entry to the body code.
10961 BBs.reserve(BBs.size() + 6);
10962 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10963}
10964
10965BasicBlock *CanonicalLoopInfo::getPreheader() const {
10966 assert(isValid() && "Requires a valid canonical loop");
10967 for (BasicBlock *Pred : predecessors(Header)) {
10968 if (Pred != Latch)
10969 return Pred;
10970 }
10971 llvm_unreachable("Missing preheader");
10972}
10973
10974void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10975 assert(isValid() && "Requires a valid canonical loop");
10976
10977 Instruction *CmpI = &getCond()->front();
10978 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10979 CmpI->setOperand(1, TripCount);
10980
10981#ifndef NDEBUG
10982 assertOK();
10983#endif
10984}
10985
10986void CanonicalLoopInfo::mapIndVar(
10987 llvm::function_ref<Value *(Instruction *)> Updater) {
10988 assert(isValid() && "Requires a valid canonical loop");
10989
10990 Instruction *OldIV = getIndVar();
10991
10992 // Record all uses excluding those introduced by the updater. Uses by the
10993 // CanonicalLoopInfo itself to keep track of the number of iterations are
10994 // excluded.
10995 SmallVector<Use *> ReplacableUses;
10996 for (Use &U : OldIV->uses()) {
10997 auto *User = dyn_cast<Instruction>(U.getUser());
10998 if (!User)
10999 continue;
11000 if (User->getParent() == getCond())
11001 continue;
11002 if (User->getParent() == getLatch())
11003 continue;
11004 ReplacableUses.push_back(&U);
11005 }
11006
11007 // Run the updater that may introduce new uses
11008 Value *NewIV = Updater(OldIV);
11009
11010 // Replace the old uses with the value returned by the updater.
11011 for (Use *U : ReplacableUses)
11012 U->set(NewIV);
11013
11014#ifndef NDEBUG
11015 assertOK();
11016#endif
11017}
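
// Usage sketch (caller-side, names assumed): shift the induction variable
// seen by the body without disturbing the loop-control uses in Cond/Latch:
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, ChunkBase, "iv.shifted");
//   });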
11018
11019void CanonicalLoopInfo::assertOK() const {
11020#ifndef NDEBUG
11021 // No constraints if this object currently does not describe a loop.
11022 if (!isValid())
11023 return;
11024
11025 BasicBlock *Preheader = getPreheader();
11026 BasicBlock *Body = getBody();
11027 BasicBlock *After = getAfter();
11028
11029 // Verify standard control-flow we use for OpenMP loops.
11030 assert(Preheader);
11031 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11032 "Preheader must terminate with unconditional branch");
11033 assert(Preheader->getSingleSuccessor() == Header &&
11034 "Preheader must jump to header");
11035
11036 assert(Header);
11037 assert(isa<BranchInst>(Header->getTerminator()) &&
11038 "Header must terminate with unconditional branch");
11039 assert(Header->getSingleSuccessor() == Cond &&
11040 "Header must jump to exiting block");
11041
11042 assert(Cond);
11043 assert(Cond->getSinglePredecessor() == Header &&
11044 "Exiting block only reachable from header");
11045
11046 assert(isa<BranchInst>(Cond->getTerminator()) &&
11047 "Exiting block must terminate with conditional branch");
11048 assert(size(successors(Cond)) == 2 &&
11049 "Exiting block must have two successors");
11050 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11051 "Exiting block's first successor jump to the body");
11052 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11053 "Exiting block's second successor must exit the loop");
11054
11055 assert(Body);
11056 assert(Body->getSinglePredecessor() == Cond &&
11057 "Body only reachable from exiting block");
11058 assert(!isa<PHINode>(Body->front()));
11059
11060 assert(Latch);
11061 assert(isa<BranchInst>(Latch->getTerminator()) &&
11062 "Latch must terminate with unconditional branch");
11063 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11064 // TODO: To support simple redirecting of the end of the body code that has
11065 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
11066 assert(Latch->getSinglePredecessor() != nullptr);
11067 assert(!isa<PHINode>(Latch->front()));
11068
11069 assert(Exit);
11070 assert(isa<BranchInst>(Exit->getTerminator()) &&
11071 "Exit block must terminate with unconditional branch");
11072 assert(Exit->getSingleSuccessor() == After &&
11073 "Exit block must jump to after block");
11074
11075 assert(After);
11076 assert(After->getSinglePredecessor() == Exit &&
11077 "After block only reachable from exit block");
11078 assert(After->empty() || !isa<PHINode>(After->front()));
11079
11080 Instruction *IndVar = getIndVar();
11081 assert(IndVar && "Canonical induction variable not found?");
11082 assert(isa<IntegerType>(IndVar->getType()) &&
11083 "Induction variable must be an integer");
11084 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11085 "Induction variable must be a PHI in the loop header");
11086 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11087 assert(
11088 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11089 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11090
11091 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11092 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11093 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11094 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11095 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11096 ->isOne());
11097
11098 Value *TripCount = getTripCount();
11099 assert(TripCount && "Loop trip count not found?");
11100 assert(IndVar->getType() == TripCount->getType() &&
11101 "Trip count and induction variable must have the same type");
11102
11103 auto *CmpI = cast<CmpInst>(&Cond->front());
11104 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11105 "Exit condition must be a signed less-than comparison");
11106 assert(CmpI->getOperand(0) == IndVar &&
11107 "Exit condition must compare the induction variable");
11108 assert(CmpI->getOperand(1) == TripCount &&
11109 "Exit condition must compare with the trip count");
11110#endif
11111}
11112
11113void CanonicalLoopInfo::invalidate() {
11114 Header = nullptr;
11115 Cond = nullptr;
11116 Latch = nullptr;
11117 Exit = nullptr;
11118}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
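For illustration, a minimal sketch of how this helper pairs with IRBuilderBase::CreateAtomicCmpXchg; Builder, Ptr, Expected and Desired are assumed to exist in the surrounding code:

  // Release semantics are not legal on the failure path, so derive the
  // strongest legal failure ordering from the requested success ordering.
  AtomicOrdering Success = AtomicOrdering::AcquireRelease;
  AtomicOrdering Failure =
      AtomicCmpXchgInst::getStrongestFailureOrdering(Success); // Acquire
  Builder.CreateAtomicCmpXchg(Ptr, Expected, Desired, MaybeAlign(),
                              Success, Failure);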
an instruction that atomically reads a memory location, combines it with another value, and stores the result back. Returns the old value.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic block Old.
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic, or any pseudo operation if SkipPseudoOp is true.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives in, right before MovePos.
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
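A short, hedged example of the splitBasicBlock API above; BB is an existing block and SplitPt an iterator into it, both assumed from the surrounding code:

  // Everything from SplitPt onwards moves into the new block; BB receives
  // an unconditional branch to it, so the CFG stays well formed.
  BasicBlock *Cont = BB->splitBasicBlock(SplitPt, "split.cont");
  // At this point Cont->getSinglePredecessor() == BB.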
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
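For illustration, a sketch that emits one of these predicates through an IRBuilder; Builder, IV, TripCount, BodyBB and ExitBB are assumed names:

  // icmp slt %iv, %tripcount: signed less-than, as used for loop guards.
  Value *Cmp = Builder.CreateICmp(CmpInst::ICMP_SLT, IV, TripCount, "omp.cmp");
  Builder.CreateCondBr(Cmp, BodyBB, ExitBB);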
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching the ArrayRef passed in.
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a target independent way (note: the return type is an i64).
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non-instruction representation of the dbg.value intrinsic.
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
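A minimal sketch of FunctionType::get together with Function::Create; the module M and the function name are assumptions made for illustration:

  LLVMContext &Ctx = M.getContext();
  // Declare `void @outlined.example(i32, ptr)` with internal linkage.
  FunctionType *FnTy =
      FunctionType::get(Type::getVoidTy(Ctx),
                        {Type::getInt32Ty(Ctx), PointerType::getUnqual(Ctx)},
                        /*isVarArg=*/false);
  Function *Fn = Function::Create(FnTy, GlobalValue::InternalLinkage,
                                  "outlined.example", &M);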
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:640
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:447
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:668
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if InitVal==NULL.
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
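A hedged sketch of saving and restoring an insertion point, the pattern behind OpenMPIRBuilder's InsertPointTy plumbing; Builder and EntryBB are assumed to exist:

  IRBuilderBase::InsertPoint SavedIP = Builder.saveIP();
  // Temporarily emit an alloca at the top of the entry block.
  Builder.SetInsertPoint(EntryBB, EntryBB->getFirstInsertionPt());
  Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, "tmp.addr");
  Builder.restoreIP(SavedIP); // continue where we left off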
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the debug location of the next non-debug node.
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ordering of instructions.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
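As an illustration of the MDNode/MDString factories above, a sketch of building a loop-property node of the kind addLoopMetadata attaches; Ctx is an assumed LLVMContext:

  // A single property: !{!"llvm.loop.unroll.enable"}.
  MDNode *Prop =
      MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable"));
  // Loop IDs are distinct and reference themselves in operand 0, so a
  // placeholder slot is reserved and then patched via replaceOperandWith.
  SmallVector<Metadata *> Ops = {nullptr, Prop};
  MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID);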
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple, which describes the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:58
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:228
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small sizes.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition SmallSet.h:133
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string.
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
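A sketch of the switch API in the style of a sections-style dispatch; SectionIdx, DefaultBB, Sec0BB and Sec1BB are illustrative assumptions:

  SwitchInst *SI = Builder.CreateSwitch(SectionIdx, DefaultBB, /*NumCases=*/2);
  SI->addCase(Builder.getInt32(0), Sec0BB); // section #0
  SI->addCase(Builder.getInt32(1), Sec1BB); // section #1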
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1064
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1126
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:413
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1142
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to New if the callback ShouldReplace returns true for the given use.
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of the value).
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:701
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
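A hedged sketch of the use-rewriting entry points above; Old, New and OutlinedFn are assumed values from a surrounding outlining step:

  // Redirect only the uses of Old that live inside OutlinedFn; a plain
  // Old->replaceAllUsesWith(New) would rewrite every use instead.
  Old->replaceUsesWithIf(New, [&](Use &U) {
    auto *UserI = dyn_cast<Instruction>(U.getUser());
    return UserI && UserI->getFunction() == &OutlinedFn;
  });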
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined; the runtime should get it from environment variables as described in the spec.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular function.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the zero-based index and the remaining elements come from the input ranges.
Definition STLExtras.h:2472
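A small sketch of zip_equal and enumerate; OutlinedFn, Inputs (a vector of Value*) and Blocks are assumptions made for illustration:

  // Pair each formal argument with the value that will feed it.
  for (auto [Arg, Input] : zip_equal(OutlinedFn.args(), Inputs))
    Arg.setName(Input->getName());
  // Or walk a range together with a zero-based index.
  for (auto [Idx, BB] : enumerate(Blocks))
    BB->setName("block." + utostr(Idx));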
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:360
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
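A sketch of the early-increment idiom, useful when erasing instructions mid-walk; BB is an assumed basic block:

  // eraseFromParent() would invalidate a plain iterator; the early-inc
  // range advances before the current element can be deleted.
  for (Instruction &I : make_early_inc_range(*BB))
    if (isa<UnreachableInst>(I))
      I.eraseFromParent();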
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
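A hedged sketch of this utility around an if-clause condition; Builder and IfCond are assumed to exist:

  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(IfCond, Builder.GetInsertPoint(),
                                &ThenTerm, &ElseTerm);
  Builder.SetInsertPoint(ThenTerm); // emit the "then" side here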
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
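Two small arithmetic helpers, divideCeil above and Log2_32 earlier, shown with illustrative numbers; TripCount, ChunkSize and AlignVal are assumed:

  uint64_t NumChunks = divideCeil(TripCount, ChunkSize); // divideCeil(7, 2) == 4
  unsigned Shift = Log2_32(AlignVal);                    // Log2_32(16) == 4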
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user specified parameters.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs; the deleted blocks must have no predecessors outside the set.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is auto-sensed.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.