//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause them to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
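
// Worked example (illustrative, not part of the upstream source): for
// `schedule(nonmonotonic: dynamic, 4)` with no ordered clause,
//   getOpenMPBaseScheduleType         yields BaseDynamicChunked,
//   getOpenMPOrderingScheduleType     adds   ModifierUnordered,
//   getOpenMPMonotonicityScheduleType adds   ModifierNonmonotonic,
// so the result is UnorderedDynamicChunked with the nonmonotonic bit set,
// which is the encoding ultimately handed to the OpenMP runtime.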

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
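
// Illustrative sketch (not in the upstream source): given
//   old:  %a = ...  <IP>  %b = ...  br %next
// splitBB(IP, /*CreateBranch=*/true, "new") yields
//   old:  %a = ...  br %new
//   new:  %b = ...  br %next
// and the IRBuilder overloads leave the insertion point at the end of %old
// (before its new branch), so subsequently emitted code stays in %old.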

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static llvm::Value *createFakeIntVal(IRBuilderBase &Builder,
                                     OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                                     llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                                     OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                                     const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
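
// Illustrative sketch: with AsPtr == true and Name == "gid" the helper emits
// roughly
//   ; at OuterAllocaIP:  %gid.addr = alloca i32
//   ; at InnerAllocaIP:  %gid.use  = load i32, ptr %gid.addr
// Both instructions are queued in ToBeDeleted; they exist only so that the
// CodeExtractor captures %gid.addr and materializes a corresponding argument
// in the outlined function, after which the fakes are erased.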

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
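
// Illustrative example: a translation unit containing
//   #pragma omp requires unified_shared_memory dynamic_allocators
// ends up with RequiresFlags == OMP_REQ_UNIFIED_SHARED_MEMORY |
// OMP_REQ_DYNAMIC_ALLOCATORS (0x008 | 0x010 == 0x018), which is what
// getRequiresFlags() reports; with no requires directive at all it reports
// OMP_REQ_NONE (0x001) instead.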

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  for (unsigned I = 1; I < std::min(KernelArgs.NumTeams.size(), MaxDim); ++I)
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
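
// Illustrative note: the resulting ArgsVector mirrors the
// __tgt_kernel_arguments layout consumed by __tgt_target_kernel, i.e.
//   { version, num args, arg base ptrs, arg ptrs, arg sizes, arg types,
//     arg names, arg mappers, trip count, flags, num_teams[3],
//     thread_limit[3], dyn cgroup mem }
// with the team/thread counts widened to 3-D arrays whose unused dimensions
// stay zero (from ZeroArray).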

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
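
// Illustrative sketch: for OMPRTL___kmpc_fork_call the code above produces a
// declaration along the lines of
//   declare void @__kmpc_fork_call(ptr, i32, ptr, ...) !callback !0
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
// telling interprocedural passes that the variadic arguments are forwarded
// to the microtask callee in argument position 2.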

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}
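
// Illustrative sketch: the helper above rewrites
//   entry:       ...
//   omp.target:  %x = alloca i32        ; constant-sized, not in entry
// into
//   entry:       %x = alloca i32
//   omp.target:  ...
// while leaving dynamically sized allocas (array sizes that are not
// ConstantData) untouched, as discussed in the NOTE inside finalize() below.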

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate
    // struct for aggregate params in the device default alloca address space.
    // OpenMP runtime requires that the params of the extracted functions are
    // passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry
  // block of our target, or we risk malformed optimisations by later passes.
  // This is only relevant for the device pass, which appears to be a little
  // more delicate when it comes to optimisations (however, we do not block
  // on that here; it is up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly
  // adversely affected by any raises unless intentionally appended to the
  // list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be
  // folded when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads, otherwise it will yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (auto *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
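
// Illustrative example (the flag name is hypothetical, not fixed by this
// function): createGlobalFlag(0, "__omp_rtl_debug_kind") emits
//   @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// i.e. a per-module configuration value that a runtime can read.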

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
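
// Illustrative note: the buffer built above produces the canonical ident
// string ";file;function;line;column;;", e.g. ";test.c;main;12;3;;" (example
// values), which the unknown-location fallback below mirrors with
// ";unknown;unknown;0;0;;".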

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc,
                               omp::Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
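
// Illustrative sketch: in a cancellable parallel region the code above emits
// roughly
//   %r = call i32 @__kmpc_cancel_barrier(ptr @ident, i32 %tid)
// followed by the cancellation-flag check, while the plain case degenerates
// to a simple
//   call void @__kmpc_barrier(ptr @ident, i32 %tid)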

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams.front(), Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
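
// Illustrative sketch: the resulting device-side call looks like
//   call void @__kmpc_parallel_51(ptr @ident, i32 %tid, i32 %if_cond,
//                                 i32 -1, i32 -1, ptr @outlined.omp_par,
//                                 ptr null, ptr %args, i64 %nargs)
// where %args is the on-stack array of captured pointers filled in above and
// -1 encodes "no num_threads/proc_bind request".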

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
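
// Illustrative sketch: on the host, the placeholder call site
//   call void @outlined.omp_par(ptr %tid.addr, ptr %zero.addr, ptr %cap)
// is rewritten into
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr @ident, i32 1, ptr @outlined.omp_par, ptr %cap)
// and the runtime supplies the tid/bound-tid arguments when it invokes the
// microtask.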

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate
  // struct for aggregate params in the device default alloca address space.
  // OpenMP runtime requires that the params of the extracted functions are
  // passed as zero address space pointers. This flag ensures that extracted
  // function arguments are declared in zero address space
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1521
1522 // Ensure a single exit node for the outlined region by creating one.
1523 // We might have multiple incoming edges to the exit now due to finalizations,
1524 // e.g., cancel calls that cause the control flow to leave the region.
1525 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1526 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1527 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1528 Blocks.push_back(PRegOutlinedExitBB);
1529
1530 CodeExtractorAnalysisCache CEAC(*OuterFn);
1531 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1532 /* AggregateArgs */ false,
1533 /* BlockFrequencyInfo */ nullptr,
1534 /* BranchProbabilityInfo */ nullptr,
1535 /* AssumptionCache */ nullptr,
1536 /* AllowVarArgs */ true,
1537 /* AllowAlloca */ true,
1538 /* AllocationBlock */ OuterAllocaBlock,
1539 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1540
1541 // Find inputs to, outputs from the code region.
1542 BasicBlock *CommonExit = nullptr;
1543 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1544 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1545 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1546
1547 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1548
1549 FunctionCallee TIDRTLFn =
1550 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1551
1552 auto PrivHelper = [&](Value &V) {
1553 if (&V == TIDAddr || &V == ZeroAddr) {
1554 OI.ExcludeArgsFromAggregate.push_back(&V);
1555 return;
1556 }
1557
1559 for (Use &U : V.uses())
1560 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1561 if (ParallelRegionBlockSet.count(UserI->getParent()))
1562 Uses.insert(&U);
1563
1564 // __kmpc_fork_call expects extra arguments as pointers. If the input
1565 // already has a pointer type, everything is fine. Otherwise, store the
1566 // value onto stack and load it back inside the to-be-outlined region. This
1567 // will ensure only the pointer will be passed to the function.
1568 // FIXME: if there are more than 15 trailing arguments, they must be
1569 // additionally packed in a struct.
1570 Value *Inner = &V;
1571 if (!V.getType()->isPointerTy()) {
1573 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1574
1575 Builder.restoreIP(OuterAllocaIP);
1576 Value *Ptr =
1577 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1578
1579 // Store to stack at end of the block that currently branches to the entry
1580 // block of the to-be-outlined region.
1581 Builder.SetInsertPoint(InsertBB,
1582 InsertBB->getTerminator()->getIterator());
1583 Builder.CreateStore(&V, Ptr);
1584
1585 // Load back next to allocations in the to-be-outlined region.
1586 Builder.restoreIP(InnerAllocaIP);
1587 Inner = Builder.CreateLoad(V.getType(), Ptr);
1588 }
1589
1590 Value *ReplacementValue = nullptr;
1591 CallInst *CI = dyn_cast<CallInst>(&V);
1592 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1593 ReplacementValue = PrivTID;
1594 } else {
1595 Builder.restoreIP(
1596 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1597 InnerAllocaIP = {
1598 InnerAllocaIP.getBlock(),
1599 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1600
1601 assert(ReplacementValue &&
1602 "Expected copy/create callback to set replacement value!");
1603 if (ReplacementValue == &V)
1604 return;
1605 }
1606
1607 for (Use *UPtr : Uses)
1608 UPtr->set(ReplacementValue);
1609 };
1610
1611 // Reset the inner alloca insertion point as it will be used for loading the
1612 // values wrapped into pointers before passing them into the to-be-outlined region.
1613 // Configure it to insert immediately after the fake use of zero address so
1614 // that they are available in the generated body and so that the
1615 // OpenMP-related values (thread ID and zero address pointers) remain leading
1616 // in the argument list.
1617 InnerAllocaIP = IRBuilder<>::InsertPoint(
1618 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1619
1620 // Reset the outer alloca insertion point to the entry of the relevant block
1621 // in case it was invalidated.
1622 OuterAllocaIP = IRBuilder<>::InsertPoint(
1623 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1624
1625 for (Value *Input : Inputs) {
1626 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1627 PrivHelper(*Input);
1628 }
1629 LLVM_DEBUG({
1630 for (Value *Output : Outputs)
1631 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1632 });
1633 assert(Outputs.empty() &&
1634 "OpenMP outlining should not produce live-out values!");
1635
1636 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1637 LLVM_DEBUG({
1638 for (auto *BB : Blocks)
1639 dbgs() << " PBR: " << BB->getName() << "\n";
1640 });
1641
1642 // Adjust the finalization stack, verify the adjustment, and call the
1643 // finalization callback one last time to finalize values between the pre-fini
1644 // block and the exit block if we left the parallel region "the normal way".
1645 auto FiniInfo = FinalizationStack.pop_back_val();
1646 (void)FiniInfo;
1647 assert(FiniInfo.DK == OMPD_parallel &&
1648 "Unexpected finalization stack state!");
1649
1650 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1651
1652 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1653 FiniCB(PreFiniIP);
1654
1655 // Register the outlined info.
1656 addOutlineInfo(std::move(OI));
1657
1658 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1659 UI->eraseFromParent();
1660
1661 return AfterIP;
1662}
1663
1664 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1665 // Build call void __kmpc_flush(ident_t *loc)
1666 uint32_t SrcLocStrSize;
1667 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1668 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1669
1670 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1671}
1672
1673 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1674 if (!updateToLocation(Loc))
1675 return;
1676 emitFlush(Loc);
1677}
1678
1679 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1680 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1681 // global_tid);
1682 uint32_t SrcLocStrSize;
1683 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1684 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1685 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1686
1687 // Ignore return result until untied tasks are supported.
1688 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1689 Args);
1690}
1691
1692 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1693 if (!updateToLocation(Loc))
1694 return;
1695 emitTaskwaitImpl(Loc);
1696}
1697
1698 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1699 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1703 Constant *I32Null = ConstantInt::getNullValue(Int32);
1704 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1705
1706 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1707 Args);
1708}
1709
1711 if (!updateToLocation(Loc))
1712 return;
1713 emitTaskyieldImpl(Loc);
1714}
1715
1716 // Processes the dependencies in Dependencies and does the following
1717 // - Allocates space on the stack for an array of DependInfo objects
1718 // - Populates each DependInfo object with the relevant information about
1719 // the corresponding dependence.
1720 // - All code is inserted in the entry block of the current function.
1721 static Value *emitTaskDependencies(
1722 OpenMPIRBuilder &OMPBuilder,
1723 SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1724 // Early return if we have no dependencies to process
1725 if (Dependencies.empty())
1726 return nullptr;
1727
1728 // Given a vector of DependData objects, in this function we create an
1729 // array on the stack that holds kmp_depend_info objects corresponding
1730 // to each dependency. This is then passed to the OpenMP runtime.
1731 // For example, if there are 'n' dependencies then the following pseudo
1732 // code is generated. Assume the first dependence is on a variable 'a'
1733 //
1734 // \code{c}
1735 // DepArray = alloca(n * sizeof(kmp_depend_info));
1736 // idx = 0;
1737 // DepArray[idx].base_addr = ptrtoint(&a);
1738 // DepArray[idx].len = 8;
1739 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1740 // ++idx;
1741 // DepArray[idx].base_addr = ...;
1742 // \endcode
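// A minimal sketch of the runtime-side record this array models, assuming
// the usual libomp layout (the authoritative typedef lives in the OpenMP
// runtime, not in this file; field names mirror RTLDependInfoFields):
//
// \code{c}
// typedef struct kmp_depend_info {
//   intptr_t base_addr;  /* RTLDependInfoFields::BaseAddr */
//   size_t len;          /* RTLDependInfoFields::Len */
//   unsigned char flags; /* RTLDependInfoFields::Flags */
// } kmp_depend_info_t;
// \endcode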
1743
1744 IRBuilderBase &Builder = OMPBuilder.Builder;
1745 Type *DependInfo = OMPBuilder.DependInfo;
1746 Module &M = OMPBuilder.M;
1747
1748 Value *DepArray = nullptr;
1749 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1750 Builder.SetInsertPoint(
1751 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1752
1753 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1754 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1755
1756 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1757 Value *Base =
1758 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1759 // Store the pointer to the variable
1760 Value *Addr = Builder.CreateStructGEP(
1761 DependInfo, Base,
1762 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1763 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1764 Builder.CreateStore(DepValPtr, Addr);
1765 // Store the size of the variable
1766 Value *Size = Builder.CreateStructGEP(
1767 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1768 Builder.CreateStore(
1769 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1770 Size);
1771 // Store the dependency kind
1772 Value *Flags = Builder.CreateStructGEP(
1773 DependInfo, Base,
1774 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1775 Builder.CreateStore(
1776 ConstantInt::get(Builder.getInt8Ty(),
1777 static_cast<unsigned int>(Dep.DepKind)),
1778 Flags);
1779 }
1780 Builder.restoreIP(OldIP);
1781 return DepArray;
1782}
1783
1784 OpenMPIRBuilder::InsertPointTy
1785 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1786 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1787 bool Tied, Value *Final, Value *IfCondition,
1788 SmallVector<DependData> Dependencies) {
1789
1790 if (!updateToLocation(Loc))
1791 return InsertPointTy();
1792
1793 uint32_t SrcLocStrSize;
1794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1795 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1796 // The current basic block is split into four basic blocks. After outlining,
1797 // they will be mapped as follows:
1798 // ```
1799 // def current_fn() {
1800 // current_basic_block:
1801 // br label %task.exit
1802 // task.exit:
1803 // ; instructions after task
1804 // }
1805 // def outlined_fn() {
1806 // task.alloca:
1807 // br label %task.body
1808 // task.body:
1809 // ret void
1810 // }
1811 // ```
1812 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1813 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1814 BasicBlock *TaskAllocaBB =
1815 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1816
1817 InsertPointTy TaskAllocaIP =
1818 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1819 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1820 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1821
1822 OutlineInfo OI;
1823 OI.EntryBB = TaskAllocaBB;
1824 OI.OuterAllocaBB = AllocaIP.getBlock();
1825 OI.ExitBB = TaskExitBB;
1826
1827 // Add the thread ID argument.
1828 SmallVector<Instruction *, 4> ToBeDeleted;
1829 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1830 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1831
1832 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1833 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1834 // Replace the stale call instruction (StaleCI) with the appropriate RTL function call.
1835 assert(OutlinedFn.getNumUses() == 1 &&
1836 "there must be a single user for the outlined function");
1837 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1838
1839 // HasShareds is true if any variables are captured in the outlined region,
1840 // false otherwise.
1841 bool HasShareds = StaleCI->arg_size() > 1;
1842 Builder.SetInsertPoint(StaleCI);
1843
1844 // Gather the arguments for emitting the runtime call for
1845 // @__kmpc_omp_task_alloc
1846 Function *TaskAllocFn =
1847 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1848
1849 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1850 // call.
1851 Value *ThreadID = getOrCreateThreadID(Ident);
1852
1853 // Argument - `flags`
1854 // Task is tied iff (Flags & 1) == 1.
1855 // Task is untied iff (Flags & 1) == 0.
1856 // Task is final iff (Flags & 2) == 2.
1857 // Task is not final iff (Flags & 2) == 0.
1858 // TODO: Handle the other flags.
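// For instance, a tied task carrying a `final(%cond)` clause yields flag
// computation of roughly this shape (illustrative IR; value names invented):
//   %final.flag = select i1 %cond, i32 2, i32 0
//   %flags = or i32 %final.flag, 1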
1859 Value *Flags = Builder.getInt32(Tied);
1860 if (Final) {
1861 Value *FinalFlag =
1862 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1863 Flags = Builder.CreateOr(FinalFlag, Flags);
1864 }
1865
1866 // Argument - `sizeof_kmp_task_t` (TaskSize)
1867 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1868 // including private vars accessed in the task.
1869 // TODO: add kmp_task_t_with_privates (privates)
1870 Value *TaskSize = Builder.getInt64(
1871 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1872
1873 // Argument - `sizeof_shareds` (SharedsSize)
1874 // SharedsSize refers to the shareds array size in the kmp_task_t data
1875 // structure.
1876 Value *SharedsSize = Builder.getInt64(0);
1877 if (HasShareds) {
1878 AllocaInst *ArgStructAlloca =
1879 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1880 assert(ArgStructAlloca &&
1881 "Unable to find the alloca instruction corresponding to arguments "
1882 "for extracted function");
1883 StructType *ArgStructType =
1884 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1885 assert(ArgStructType && "Unable to find struct type corresponding to "
1886 "arguments for extracted function");
1887 SharedsSize =
1888 M.getDataLayout().getTypeStoreSize(ArgStructType);
1889 }
1890 // Emit the @__kmpc_omp_task_alloc runtime call
1891 // The runtime call returns a pointer to an area where the task captured
1892 // variables must be copied before the task is run (TaskData)
1893 CallInst *TaskData = Builder.CreateCall(
1894 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1895 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1896 /*task_func=*/&OutlinedFn});
1897
1898 // Copy the arguments for outlined function
1899 if (HasShareds) {
1900 Value *Shareds = StaleCI->getArgOperand(1);
1901 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1902 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1903 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1904 SharedsSize);
1905 }
1906
1907 Value *DepArray = nullptr;
1908 if (Dependencies.size()) {
1909 InsertPointTy OldIP = Builder.saveIP();
1910 Builder.SetInsertPoint(
1911 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1912
1913 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1914 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1915
1916 unsigned P = 0;
1917 for (const DependData &Dep : Dependencies) {
1918 Value *Base =
1919 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1920 // Store the pointer to the variable
1921 Value *Addr = Builder.CreateStructGEP(
1922 DependInfo, Base,
1923 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1924 Value *DepValPtr =
1925 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1926 Builder.CreateStore(DepValPtr, Addr);
1927 // Store the size of the variable
1928 Value *Size = Builder.CreateStructGEP(
1929 DependInfo, Base,
1930 static_cast<unsigned int>(RTLDependInfoFields::Len));
1931 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1932 Dep.DepValueType)),
1933 Size);
1934 // Store the dependency kind
1935 Value *Flags = Builder.CreateStructGEP(
1936 DependInfo, Base,
1937 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1938 Builder.CreateStore(
1939 ConstantInt::get(Builder.getInt8Ty(),
1940 static_cast<unsigned int>(Dep.DepKind)),
1941 Flags);
1942 ++P;
1943 }
1944
1945 Builder.restoreIP(OldIP);
1946 }
1947
1948 // In the presence of the `if` clause, the following IR is generated:
1949 // ...
1950 // %data = call @__kmpc_omp_task_alloc(...)
1951 // br i1 %if_condition, label %then, label %else
1952 // then:
1953 // call @__kmpc_omp_task(...)
1954 // br label %exit
1955 // else:
1956 // ;; Wait for resolution of dependencies, if any, before
1957 // ;; beginning the task
1958 // call @__kmpc_omp_wait_deps(...)
1959 // call @__kmpc_omp_task_begin_if0(...)
1960 // call @outlined_fn(...)
1961 // call @__kmpc_omp_task_complete_if0(...)
1962 // br label %exit
1963 // exit:
1964 // ...
1965 if (IfCondition) {
1966 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1967 // terminator.
1968 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1969 Instruction *IfTerminator =
1970 Builder.GetInsertPoint()->getParent()->getTerminator();
1971 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1972 Builder.SetInsertPoint(IfTerminator);
1973 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1974 &ElseTI);
1975 Builder.SetInsertPoint(ElseTI);
1976
1977 if (Dependencies.size()) {
1978 Function *TaskWaitFn =
1979 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1980 Builder.CreateCall(
1981 TaskWaitFn,
1982 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1983 ConstantInt::get(Builder.getInt32Ty(), 0),
1984 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1985 }
1986 Function *TaskBeginFn =
1987 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1988 Function *TaskCompleteFn =
1989 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1990 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1991 CallInst *CI = nullptr;
1992 if (HasShareds)
1993 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1994 else
1995 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1996 CI->setDebugLoc(StaleCI->getDebugLoc());
1997 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1998 Builder.SetInsertPoint(ThenTI);
1999 }
2000
2001 if (Dependencies.size()) {
2002 Function *TaskFn =
2003 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2004 Builder.CreateCall(
2005 TaskFn,
2006 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2007 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2008 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2009
2010 } else {
2011 // Emit the @__kmpc_omp_task runtime call to spawn the task
2012 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2013 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2014 }
2015
2016 StaleCI->eraseFromParent();
2017
2018 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2019 if (HasShareds) {
2020 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2021 OutlinedFn.getArg(1)->replaceUsesWithIf(
2022 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2023 }
2024
2025 llvm::for_each(llvm::reverse(ToBeDeleted),
2026 [](Instruction *I) { I->eraseFromParent(); });
2027 };
2028
2029 addOutlineInfo(std::move(OI));
2030 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2031
2032 return Builder.saveIP();
2033}
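// [Summary sketch] For a plain tied task with no captured variables and no
// dependencies, the net effect at the spawn site is roughly:
//   %task = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %tid, i32 1,
//                                           i64 %sizeof_kmp_task_t, i64 0,
//                                           ptr @outlined_fn)
//   call i32 @__kmpc_omp_task(ptr @loc, i32 %tid, ptr %task)
// (Illustrative IR; %sizeof_kmp_task_t stands for the constant TaskSize
// computed above.)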
2034
2035 OpenMPIRBuilder::InsertPointTy
2036 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2037 InsertPointTy AllocaIP,
2038 BodyGenCallbackTy BodyGenCB) {
2039 if (!updateToLocation(Loc))
2040 return InsertPointTy();
2041
2042 uint32_t SrcLocStrSize;
2043 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2044 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2045 Value *ThreadID = getOrCreateThreadID(Ident);
2046
2047 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2048 Function *TaskgroupFn =
2049 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2050 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2051
2052 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2053 BodyGenCB(AllocaIP, Builder.saveIP());
2054
2055 Builder.SetInsertPoint(TaskgroupExitBB);
2056 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2057 Function *EndTaskgroupFn =
2058 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2059 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2060
2061 return Builder.saveIP();
2062}
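// [Illustrative sketch] The region produced above brackets the user body
// between the two runtime calls, i.e. roughly:
//   call void @__kmpc_taskgroup(ptr @loc, i32 %tid)
//   ; ... body emitted by BodyGenCB, typically spawning tasks ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %tid)
// where __kmpc_end_taskgroup waits for completion of all tasks created in
// the region.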
2063
2064 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
2065 const LocationDescription &Loc, InsertPointTy AllocaIP,
2066 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2067 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2068 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2069
2070 if (!updateToLocation(Loc))
2071 return Loc.IP;
2072
2073 auto FiniCBWrapper = [&](InsertPointTy IP) {
2074 if (IP.getBlock()->end() != IP.getPoint())
2075 return FiniCB(IP);
2076 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2077 // will fail because that function requires the finalization basic block to
2078 // have a terminator, which is already removed by EmitOMPRegionBody.
2079 // IP is currently at the cancellation block.
2080 // We need to backtrack to the condition block to fetch
2081 // the exit block and create a branch from the cancellation
2082 // block to the exit block.
2084 Builder.restoreIP(IP);
2085 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2086 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2087 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2088 Instruction *I = Builder.CreateBr(ExitBB);
2089 IP = InsertPointTy(I->getParent(), I->getIterator());
2090 return FiniCB(IP);
2091 };
2092
2093 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2094
2095 // Each section is emitted as a switch case
2096 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2097 // -> OMP.createSection() which generates the IR for each section
2098 // Iterate through all sections and emit a switch construct:
2099 // switch (IV) {
2100 // case 0:
2101 // <SectionStmt[0]>;
2102 // break;
2103 // ...
2104 // case <NumSection> - 1:
2105 // <SectionStmt[<NumSection> - 1]>;
2106 // break;
2107 // }
2108 // ...
2109 // section_loop.after:
2110 // <FiniCB>;
2111 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2112 Builder.restoreIP(CodeGenIP);
2113 BasicBlock *Continue =
2114 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2115 Function *CurFn = Continue->getParent();
2116 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2117
2118 unsigned CaseNumber = 0;
2119 for (auto SectionCB : SectionCBs) {
2120 BasicBlock *CaseBB = BasicBlock::Create(
2121 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2122 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2123 Builder.SetInsertPoint(CaseBB);
2124 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2125 SectionCB(InsertPointTy(),
2126 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2127 CaseNumber++;
2128 }
2129 // Remove the existing terminator from the body BB since there can be no
2130 // terminators after a switch/case.
2131 };
2132 // Loop body ends here.
2133 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2134 Type *I32Ty = Type::getInt32Ty(M.getContext());
2135 Value *LB = ConstantInt::get(I32Ty, 0);
2136 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2137 Value *ST = ConstantInt::get(I32Ty, 1);
2138 llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2139 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2140 InsertPointTy AfterIP =
2141 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2142
2143 // Apply the finalization callback in LoopAfterBB
2144 auto FiniInfo = FinalizationStack.pop_back_val();
2145 assert(FiniInfo.DK == OMPD_sections &&
2146 "Unexpected finalization stack state!");
2147 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2148 Builder.restoreIP(AfterIP);
2149 BasicBlock *FiniBB =
2150 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2151 CB(Builder.saveIP());
2152 AfterIP = {FiniBB, FiniBB->begin()};
2153 }
2154
2155 return AfterIP;
2156}
2157
2158 OpenMPIRBuilder::InsertPointTy
2159 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2160 BodyGenCallbackTy BodyGenCB,
2160 BodyGenCallbackTy BodyGenCB,
2161 FinalizeCallbackTy FiniCB) {
2162 if (!updateToLocation(Loc))
2163 return Loc.IP;
2164
2165 auto FiniCBWrapper = [&](InsertPointTy IP) {
2166 if (IP.getBlock()->end() != IP.getPoint())
2167 return FiniCB(IP);
2168 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2169 // will fail because that function requires the finalization basic block to
2170 // have a terminator, which is already removed by EmitOMPRegionBody.
2171 // IP is currently at the cancellation block.
2172 // We need to backtrack to the condition block to fetch
2173 // the exit block and create a branch from the cancellation
2174 // block to the exit block.
2176 Builder.restoreIP(IP);
2177 auto *CaseBB = Loc.IP.getBlock();
2178 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2179 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2180 Instruction *I = Builder.CreateBr(ExitBB);
2181 IP = InsertPointTy(I->getParent(), I->getIterator());
2182 return FiniCB(IP);
2183 };
2184
2185 Directive OMPD = Directive::OMPD_sections;
2186 // Since we are using Finalization Callback here, HasFinalize
2187 // and IsCancellable have to be true
2188 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2189 /*Conditional*/ false, /*hasFinalize*/ true,
2190 /*IsCancellable*/ true);
2191}
2192
2193 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2194 BasicBlock::iterator IT(I);
2195 IT++;
2196 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2197}
2198
2199void OpenMPIRBuilder::emitUsed(StringRef Name,
2200 std::vector<WeakTrackingVH> &List) {
2201 if (List.empty())
2202 return;
2203
2204 // Convert List to what ConstantArray needs.
2205 SmallVector<Constant *, 8> UsedArray;
2206 UsedArray.resize(List.size());
2207 for (unsigned I = 0, E = List.size(); I != E; ++I)
2208 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2209 cast<Constant>(&*List[I]), Builder.getPtrTy());
2210
2211 if (UsedArray.empty())
2212 return;
2213 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2214
2215 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2216 ConstantArray::get(ATy, UsedArray), Name);
2217
2218 GV->setSection("llvm.metadata");
2219}
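// [Illustrative sketch] For Name == "llvm.compiler.used" and a single entry
// @foo, the global produced above looks roughly like:
//   @llvm.compiler.used = appending global [1 x ptr] [ptr @foo],
//                         section "llvm.metadata"
// which prevents the compiler from discarding @foo.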
2220
2221Value *OpenMPIRBuilder::getGPUThreadID() {
2222 return Builder.CreateCall(
2223 getOrCreateRuntimeFunction(M,
2224 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2225 {});
2226}
2227
2228Value *OpenMPIRBuilder::getGPUWarpSize() {
2229 return Builder.CreateCall(
2230 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2231}
2232
2233Value *OpenMPIRBuilder::getNVPTXWarpID() {
2234 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2235 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2236}
2237
2238Value *OpenMPIRBuilder::getNVPTXLaneID() {
2239 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2240 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2241 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2242 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2243 "nvptx_lane_id");
2244}
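// [Worked example] With the common warp size of 32, LaneIDBits = log2(32) = 5
// and LaneIDMask = 0x1f, so getNVPTXWarpID computes tid >> 5 and
// getNVPTXLaneID computes tid & 0x1f; e.g. GPU thread 70 is lane 6 of
// warp 2 (70 = 2 * 32 + 6).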
2245
2246Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2247 Type *ToType) {
2248 Type *FromType = From->getType();
2249 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2250 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2251 assert(FromSize > 0 && "From size must be greater than zero");
2252 assert(ToSize > 0 && "To size must be greater than zero");
2253 if (FromType == ToType)
2254 return From;
2255 if (FromSize == ToSize)
2256 return Builder.CreateBitCast(From, ToType);
2257 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2258 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2259 InsertPointTy SaveIP = Builder.saveIP();
2260 Builder.restoreIP(AllocaIP);
2261 Value *CastItem = Builder.CreateAlloca(ToType);
2262 Builder.restoreIP(SaveIP);
2263
2264 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2265 CastItem, FromType->getPointerTo());
2266 Builder.CreateStore(From, ValCastItem);
2267 return Builder.CreateLoad(ToType, CastItem);
2268}
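// [Illustrative examples] castValueToType picks the cheapest legal
// conversion:
//   i32 -> i32   : returned unchanged
//   float -> i32 : bitcast (identical store size)
//   i16 -> i32   : signed integer cast (sext)
//   odd-sized aggregate -> i64 : spilled to the stack slot allocated above
//                                and reloaded with the target type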
2269
2270Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2271 Value *Element,
2272 Type *ElementType,
2273 Value *Offset) {
2274 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2275 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2276
2277 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2278 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2279 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2280 Value *WarpSize =
2281 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2282 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2283 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2284 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2285 Value *WarpSizeCast =
2286 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2287 Value *ShuffleCall =
2288 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2289 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2290}
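// [Illustrative example] For a 4-byte element and a remote-lane delta of 16,
// the emitted call is roughly:
//   %r = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 16, i16 %warp_size)
// where the runtime typically lowers the call to a down-shuffle that fetches
// the value held by the lane 16 lanes away.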
2291
2292void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2293 Value *DstAddr, Type *ElemType,
2294 Value *Offset, Type *ReductionArrayTy) {
2295 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2296 // Create the loop over the big-sized data.
2297 // ptr = (void*)Elem;
2298 // ptrEnd = (void*) Elem + 1;
2299 // Step = 8;
2300 // while (ptr + Step < ptrEnd)
2301 // shuffle((int64_t)*ptr);
2302 // Step = 4;
2303 // while (ptr + Step < ptrEnd)
2304 // shuffle((int32_t)*ptr);
2305 // ...
2306 Type *IndexTy = Builder.getIndexTy(
2307 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2308 Value *ElemPtr = DstAddr;
2309 Value *Ptr = SrcAddr;
2310 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2311 if (Size < IntSize)
2312 continue;
2313 Type *IntType = Builder.getIntNTy(IntSize * 8);
2314 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2315 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2316 Value *SrcAddrGEP =
2317 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2318 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2319 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2320
2321 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2322 if ((Size / IntSize) > 1) {
2323 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2324 SrcAddrGEP, Builder.getPtrTy());
2325 BasicBlock *PreCondBB =
2326 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2327 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2328 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2329 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2330 emitBlock(PreCondBB, CurFunc);
2331 PHINode *PhiSrc =
2332 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2333 PhiSrc->addIncoming(Ptr, CurrentBB);
2334 PHINode *PhiDest =
2335 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2336 PhiDest->addIncoming(ElemPtr, CurrentBB);
2337 Ptr = PhiSrc;
2338 ElemPtr = PhiDest;
2339 Value *PtrDiff = Builder.CreatePtrDiff(
2340 Builder.getInt8Ty(), PtrEnd,
2341 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2342 Builder.CreateCondBr(
2343 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2344 ExitBB);
2345 emitBlock(ThenBB, CurFunc);
2346 Value *Res = createRuntimeShuffleFunction(
2347 AllocaIP,
2348 Builder.CreateAlignedLoad(
2349 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2350 IntType, Offset);
2351 Builder.CreateAlignedStore(Res, ElemPtr,
2352 M.getDataLayout().getPrefTypeAlign(ElemType));
2353 Value *LocalPtr =
2354 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2355 Value *LocalElemPtr =
2356 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2357 PhiSrc->addIncoming(LocalPtr, ThenBB);
2358 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2359 emitBranch(PreCondBB);
2360 emitBlock(ExitBB, CurFunc);
2361 } else {
2362 Value *Res = createRuntimeShuffleFunction(
2363 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2364 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2365 Res->getType()->getScalarSizeInBits())
2366 Res = Builder.CreateTrunc(Res, ElemType);
2367 Builder.CreateStore(Res, ElemPtr);
2368 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2369 ElemPtr =
2370 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2371 }
2372 Size = Size % IntSize;
2373 }
2374}
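// [Worked example] For a 12-byte element, the loop above issues one 8-byte
// shuffle (Size = 12, 12 / 8 == 1, remainder 12 % 8 = 4) followed by one
// 4-byte shuffle (4 % 4 = 0), advancing the source and destination pointers
// by the integer width after each round.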
2375
2376void OpenMPIRBuilder::emitReductionListCopy(
2377 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2378 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2379 CopyOptionsTy CopyOptions) {
2380 Type *IndexTy = Builder.getIndexTy(
2381 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2382 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2383
2384 // Iterate, element by element, through the source Reduce list and
2385 // make a copy.
2386 for (auto En : enumerate(ReductionInfos)) {
2387 const ReductionInfo &RI = En.value();
2388 Value *SrcElementAddr = nullptr;
2389 Value *DestElementAddr = nullptr;
2390 Value *DestElementPtrAddr = nullptr;
2391 // Should we shuffle in an element from a remote lane?
2392 bool ShuffleInElement = false;
2393 // Set to true to update the pointer in the dest Reduce list to a
2394 // newly created element.
2395 bool UpdateDestListPtr = false;
2396
2397 // Step 1.1: Get the address for the src element in the Reduce list.
2398 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2399 ReductionArrayTy, SrcBase,
2400 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2401 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2402
2403 // Step 1.2: Create a temporary to store the element in the destination
2404 // Reduce list.
2405 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2406 ReductionArrayTy, DestBase,
2407 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2408 switch (Action) {
2409 case CopyAction::RemoteLaneToThread: {
2410 InsertPointTy CurIP = Builder.saveIP();
2411 Builder.restoreIP(AllocaIP);
2412 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2413 ".omp.reduction.element");
2414 DestAlloca->setAlignment(
2415 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2416 DestElementAddr = DestAlloca;
2417 DestElementAddr =
2418 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2419 DestElementAddr->getName() + ".ascast");
2420 Builder.restoreIP(CurIP);
2421 ShuffleInElement = true;
2422 UpdateDestListPtr = true;
2423 break;
2424 }
2425 case CopyAction::ThreadCopy: {
2426 DestElementAddr =
2427 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2428 break;
2429 }
2430 }
2431
2432 // Now that all active lanes have read the element in the
2433 // Reduce list, shuffle over the value from the remote lane.
2434 if (ShuffleInElement) {
2435 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2436 RemoteLaneOffset, ReductionArrayTy);
2437 } else {
2438 switch (RI.EvaluationKind) {
2439 case EvalKind::Scalar: {
2440 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2441 // Store the source element value to the dest element address.
2442 Builder.CreateStore(Elem, DestElementAddr);
2443 break;
2444 }
2445 case EvalKind::Complex: {
2446 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2447 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2448 Value *SrcReal = Builder.CreateLoad(
2449 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2450 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2451 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2452 Value *SrcImg = Builder.CreateLoad(
2453 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2454
2455 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2456 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2457 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2458 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2459 Builder.CreateStore(SrcReal, DestRealPtr);
2460 Builder.CreateStore(SrcImg, DestImgPtr);
2461 break;
2462 }
2463 case EvalKind::Aggregate: {
2464 Value *SizeVal = Builder.getInt64(
2465 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2466 Builder.CreateMemCpy(
2467 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2468 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2469 SizeVal, false);
2470 break;
2471 }
2472 };
2473 }
2474
2475 // Step 3.1: Modify reference in dest Reduce list as needed.
2476 // Modifying the reference in Reduce list to point to the newly
2477 // created element. The element is live in the current function
2478 // scope and that of functions it invokes (i.e., reduce_function).
2479 // RemoteReduceData[i] = (void*)&RemoteElem
2480 if (UpdateDestListPtr) {
2481 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2482 DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2485 }
2486 }
2487}
2488
2489Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2490 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2491 AttributeList FuncAttrs) {
2492 InsertPointTy SavedIP = Builder.saveIP();
2493 LLVMContext &Ctx = M.getContext();
2494 FunctionType *FuncTy = FunctionType::get(
2495 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2496 /* IsVarArg */ false);
2497 Function *WcFunc =
2498 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2499 "_omp_reduction_inter_warp_copy_func", &M);
2500 WcFunc->setAttributes(FuncAttrs);
2501 WcFunc->addParamAttr(0, Attribute::NoUndef);
2502 WcFunc->addParamAttr(1, Attribute::NoUndef);
2503 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2504 Builder.SetInsertPoint(EntryBB);
2505
2506 // ReduceList: thread local Reduce list.
2507 // At the stage of the computation when this function is called, partially
2508 // aggregated values reside in the first lane of every active warp.
2509 Argument *ReduceListArg = WcFunc->getArg(0);
2510 // NumWarps: number of warps active in the parallel region. This could
2511 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2512 Argument *NumWarpsArg = WcFunc->getArg(1);
2513
2514 // This array is used as a medium to transfer, one reduce element at a time,
2515 // the data from the first lane of every warp to lanes in the first warp
2516 // in order to perform the final step of a reduction in a parallel region
2517 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2518 // for reduced latency, as well as to have a distinct copy for concurrently
2519 // executing target regions. The array is declared with common linkage so
2520 // as to be shared across compilation units.
2521 StringRef TransferMediumName =
2522 "__openmp_nvptx_data_transfer_temporary_storage";
2523 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2524 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2525 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2526 if (!TransferMedium) {
2527 TransferMedium = new GlobalVariable(
2528 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2529 UndefValue::get(ArrayTy), TransferMediumName,
2530 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2531 /*AddressSpace=*/3);
2532 }
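// [Illustrative sketch] With the default warp size of 32, the declaration
// created above is roughly:
//   @__openmp_nvptx_data_transfer_temporary_storage =
//       weak addrspace(3) global [32 x i32] undef
// (address space 3 being the NVPTX/AMDGPU shared, a.k.a. LDS, memory).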
2533
2534 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2535 Value *GPUThreadID = getGPUThreadID();
2536 // nvptx_lane_id = nvptx_id % warpsize
2537 Value *LaneID = getNVPTXLaneID();
2538 // nvptx_warp_id = nvptx_id / warpsize
2539 Value *WarpID = getNVPTXWarpID();
2540
2541 InsertPointTy AllocaIP =
2542 InsertPointTy(Builder.GetInsertBlock(),
2543 Builder.GetInsertBlock()->getFirstInsertionPt());
2544 Type *Arg0Type = ReduceListArg->getType();
2545 Type *Arg1Type = NumWarpsArg->getType();
2546 Builder.restoreIP(AllocaIP);
2547 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2548 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2549 AllocaInst *NumWarpsAlloca =
2550 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2551 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2552 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2553 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2554 NumWarpsAlloca, Arg1Type->getPointerTo(),
2555 NumWarpsAlloca->getName() + ".ascast");
2556 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2557 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2558 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2559 InsertPointTy CodeGenIP =
2560 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2561 Builder.restoreIP(CodeGenIP);
2562
2563 Value *ReduceList =
2564 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2565
2566 for (auto En : enumerate(ReductionInfos)) {
2567 //
2568 // Warp master copies reduce element to transfer medium in __shared__
2569 // memory.
2570 //
2571 const ReductionInfo &RI = En.value();
2572 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2573 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2574 Type *CType = Builder.getIntNTy(TySize * 8);
2575
2576 unsigned NumIters = RealTySize / TySize;
2577 if (NumIters == 0)
2578 continue;
2579 Value *Cnt = nullptr;
2580 Value *CntAddr = nullptr;
2581 BasicBlock *PrecondBB = nullptr;
2582 BasicBlock *ExitBB = nullptr;
2583 if (NumIters > 1) {
2584 CodeGenIP = Builder.saveIP();
2585 Builder.restoreIP(AllocaIP);
2586 CntAddr =
2587 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2588
2589 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2590 CntAddr->getName() + ".ascast");
2591 Builder.restoreIP(CodeGenIP);
2592 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2593 CntAddr,
2594 /*Volatile=*/false);
2595 PrecondBB = BasicBlock::Create(Ctx, "precond");
2596 ExitBB = BasicBlock::Create(Ctx, "exit");
2597 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2598 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2599 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2600 /*Volatile=*/false);
2601 Value *Cmp = Builder.CreateICmpULT(
2602 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2603 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2604 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2605 }
2606
2607 // kmpc_barrier.
2608 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2609 omp::Directive::OMPD_unknown,
2610 /* ForceSimpleCall */ false,
2611 /* CheckCancelFlag */ true);
2612 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2613 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2614 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2615
2616 // if (lane_id == 0)
2617 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2618 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2619 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2620
2621 // Reduce element = LocalReduceList[i]
2622 auto *RedListArrayTy =
2623 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2624 Type *IndexTy = Builder.getIndexTy(
2626 Value *ElemPtrPtr =
2627 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2628 {ConstantInt::get(IndexTy, 0),
2629 ConstantInt::get(IndexTy, En.index())});
2630 // elemptr = ((CopyType*)(elemptrptr)) + I
2631 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2632 if (NumIters > 1)
2633 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2634
2635 // Get pointer to location in transfer medium.
2636 // MediumPtr = &medium[warp_id]
2637 Value *MediumPtr = Builder.CreateInBoundsGEP(
2638 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2639 // elem = *elemptr
2640 //*MediumPtr = elem
2641 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2642 // Store the source element value to the dest element address.
2643 Builder.CreateStore(Elem, MediumPtr,
2644 /*IsVolatile*/ true);
2645 Builder.CreateBr(MergeBB);
2646
2647 // else
2648 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2649 Builder.CreateBr(MergeBB);
2650
2651 // endif
2652 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2653 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2654 omp::Directive::OMPD_unknown,
2655 /* ForceSimpleCall */ false,
2656 /* CheckCancelFlag */ true);
2657
2658 // Warp 0 copies reduce element from transfer medium
2659 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2660 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2661 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2662
2663 Value *NumWarpsVal =
2664 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2665 // Up to 32 threads in warp 0 are active.
2666 Value *IsActiveThread =
2667 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2668 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2669
2670 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2671
2672 // SrcMediumPtr = &medium[tid]
2673 // SrcMediumVal = *SrcMediumPtr
2674 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2675 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2676 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2677 Value *TargetElemPtrPtr =
2678 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2679 {ConstantInt::get(IndexTy, 0),
2680 ConstantInt::get(IndexTy, En.index())});
2681 Value *TargetElemPtrVal =
2682 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2683 Value *TargetElemPtr = TargetElemPtrVal;
2684 if (NumIters > 1)
2685 TargetElemPtr =
2686 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2687
2688 // *TargetElemPtr = SrcMediumVal;
2689 Value *SrcMediumValue =
2690 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2691 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2692 Builder.CreateBr(W0MergeBB);
2693
2694 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2695 Builder.CreateBr(W0MergeBB);
2696
2697 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2698
2699 if (NumIters > 1) {
2700 Cnt = Builder.CreateNSWAdd(
2701 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2702 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2703
2704 auto *CurFn = Builder.GetInsertBlock()->getParent();
2705 emitBranch(PrecondBB);
2706 emitBlock(ExitBB, CurFn);
2707 }
2708 RealTySize %= TySize;
2709 }
2710 }
2711 }
2712 Builder.CreateRetVoid();
2713 Builder.restoreIP(SavedIP);
2714
2715 return WcFunc;
2716}
2717
2718Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2719 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2720 AttributeList FuncAttrs) {
2721 LLVMContext &Ctx = M.getContext();
2722 FunctionType *FuncTy =
2723 FunctionType::get(Builder.getVoidTy(),
2724 {Builder.getPtrTy(), Builder.getInt16Ty(),
2725 Builder.getInt16Ty(), Builder.getInt16Ty()},
2726 /* IsVarArg */ false);
2727 Function *SarFunc =
2728 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2729 "_omp_reduction_shuffle_and_reduce_func", &M);
2730 SarFunc->setAttributes(FuncAttrs);
2731 SarFunc->addParamAttr(0, Attribute::NoUndef);
2732 SarFunc->addParamAttr(1, Attribute::NoUndef);
2733 SarFunc->addParamAttr(2, Attribute::NoUndef);
2734 SarFunc->addParamAttr(3, Attribute::NoUndef);
2735 SarFunc->addParamAttr(1, Attribute::SExt);
2736 SarFunc->addParamAttr(2, Attribute::SExt);
2737 SarFunc->addParamAttr(3, Attribute::SExt);
2738 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2739 Builder.SetInsertPoint(EntryBB);
2740
2741 // Thread local Reduce list used to host the values of data to be reduced.
2742 Argument *ReduceListArg = SarFunc->getArg(0);
2743 // Current lane id; could be logical.
2744 Argument *LaneIDArg = SarFunc->getArg(1);
2745 // Offset of the remote source lane relative to the current lane.
2746 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2747 // Algorithm version. This is expected to be known at compile time.
2748 Argument *AlgoVerArg = SarFunc->getArg(3);
2749
2750 Type *ReduceListArgType = ReduceListArg->getType();
2751 Type *LaneIDArgType = LaneIDArg->getType();
2752 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2753 Value *ReduceListAlloca = Builder.CreateAlloca(
2754 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2755 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2756 LaneIDArg->getName() + ".addr");
2757 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2758 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2759 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2760 AlgoVerArg->getName() + ".addr");
2761 ArrayType *RedListArrayTy =
2762 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2763
2764 // Create a local thread-private variable to host the Reduce list
2765 // from a remote lane.
2766 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2767 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2768
2769 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2770 ReduceListAlloca, ReduceListArgType,
2771 ReduceListAlloca->getName() + ".ascast");
2772 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2773 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2774 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2775 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2776 RemoteLaneOffsetAlloca->getName() + ".ascast");
2777 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2778 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2779 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2780 RemoteReductionListAlloca, Builder.getPtrTy(),
2781 RemoteReductionListAlloca->getName() + ".ascast");
2782
2783 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2784 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2785 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2786 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2787
2788 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2789 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2790 Value *RemoteLaneOffset =
2791 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2792 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2793
2794 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2795
2796 // This loop iterates through the list of reduce elements and copies,
2797 // element by element, from a remote lane in the warp to RemoteReduceList,
2798 // hosted on the thread's stack.
2799 emitReductionListCopy(
2800 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2801 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2802
2803 // The actions to be performed on the remote Reduce list are dependent
2804 // on the algorithm version.
2805 //
2806 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2807 // LaneId % 2 == 0 && Offset > 0):
2808 // do the reduction value aggregation
2809 //
2810 // The thread local variable Reduce list is mutated in place to host the
2811 // reduced data, which is the aggregated value produced from local and
2812 // remote lanes.
2813 //
2814 // Note that AlgoVer is expected to be a constant integer known at compile
2815 // time.
2816 // When AlgoVer==0, the first conjunction evaluates to true, making
2817 // the entire predicate true at compile time.
2818 // When AlgoVer==1, only the second part of the second conjunction needs to
2819 // be evaluated at runtime; the other conjunctions evaluate to false
2820 // at compile time.
2821 // When AlgoVer==2, only the second part of the third conjunction needs to
2822 // be evaluated at runtime; the other conjunctions evaluate to false
2823 // at compile time.
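// [Worked example] With AlgoVer == 1 and a warp size of 32, the tree
// reduction invokes this function with Offset = 16, 8, 4, 2, 1. At
// Offset = 16, lanes 0-15 satisfy (LaneId < Offset) and fold in the value
// shuffled over from lanes 16-31, while lanes 16-31 take the copy branch
// further below; after log2(32) = 5 rounds, lane 0 holds the fully reduced
// warp value.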
2824 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2825 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2826 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2827 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2828 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2829 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2830 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2831 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2832 Value *RemoteOffsetComp =
2833 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2834 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2835 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2836 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2837
2838 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2839 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2840 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2841
2842 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2843 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2844 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 ReduceList, Builder.getPtrTy());
2846 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2847 RemoteListAddrCast, Builder.getPtrTy());
2848 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2849 ->addFnAttr(Attribute::NoUnwind);
2850 Builder.CreateBr(MergeBB);
2851
2852 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2853 Builder.CreateBr(MergeBB);
2854
2855 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2856
2857 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2858 // Reduce list.
2859 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2860 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2861 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2862
2863 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2864 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2865 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2866 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2867
2868 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2869 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2870 ReductionInfos, RemoteListAddrCast, ReduceList);
2871 Builder.CreateBr(CpyMergeBB);
2872
2873 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2874 Builder.CreateBr(CpyMergeBB);
2875
2876 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2877
2878 Builder.CreateRetVoid();
2879
2880 return SarFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2885 AttributeList FuncAttrs) {
2886 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2887 LLVMContext &Ctx = M.getContext();
2888 auto *FuncTy = FunctionType::get(
2889 Builder.getVoidTy(),
2890 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2891 /* IsVarArg */ false);
2892 Function *LtGCFunc =
2893 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2894 "_omp_reduction_list_to_global_copy_func", &M);
2895 LtGCFunc->setAttributes(FuncAttrs);
2896 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2897 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2898 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2899
2900 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2901 Builder.SetInsertPoint(EntryBlock);
2902
2903 // Buffer: global reduction buffer.
2904 Argument *BufferArg = LtGCFunc->getArg(0);
2905 // Idx: index of the buffer.
2906 Argument *IdxArg = LtGCFunc->getArg(1);
2907 // ReduceList: thread local Reduce list.
2908 Argument *ReduceListArg = LtGCFunc->getArg(2);
2909
2910 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2911 BufferArg->getName() + ".addr");
2912 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2913 IdxArg->getName() + ".addr");
2914 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2915 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2916 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 BufferArgAlloca, Builder.getPtrTy(),
2918 BufferArgAlloca->getName() + ".ascast");
2919 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2920 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2921 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2922 ReduceListArgAlloca, Builder.getPtrTy(),
2923 ReduceListArgAlloca->getName() + ".ascast");
2924
2925 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2926 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2927 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2928
2929 Value *LocalReduceList =
2930 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2931 Value *BufferArgVal =
2932 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2933 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2934 Type *IndexTy = Builder.getIndexTy(
2935 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2936 for (auto En : enumerate(ReductionInfos)) {
2937 const ReductionInfo &RI = En.value();
2938 auto *RedListArrayTy =
2939 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2940 // Reduce element = LocalReduceList[i]
2941 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2942 RedListArrayTy, LocalReduceList,
2943 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2944 // elemptr = ((CopyType*)(elemptrptr)) + I
2945 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2946
2947 // Global = Buffer.VD[Idx];
2948 Value *BufferVD =
2949 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2950 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
2951 ReductionsBufferTy, BufferVD, 0, En.index());
2952
2953 switch (RI.EvaluationKind) {
2954 case EvalKind::Scalar: {
2955 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2956 Builder.CreateStore(TargetElement, GlobVal);
2957 break;
2958 }
2959 case EvalKind::Complex: {
2960 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2961 RI.ElementType, ElemPtr, 0, 0, ".realp");
2962 Value *SrcReal = Builder.CreateLoad(
2963 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2964 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2965 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2966 Value *SrcImg = Builder.CreateLoad(
2967 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2968
2969 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2970 RI.ElementType, GlobVal, 0, 0, ".realp");
2971 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2972 RI.ElementType, GlobVal, 0, 1, ".imagp");
2973 Builder.CreateStore(SrcReal, DestRealPtr);
2974 Builder.CreateStore(SrcImg, DestImgPtr);
2975 break;
2976 }
2977 case EvalKind::Aggregate: {
2978 Value *SizeVal =
2979 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2980 Builder.CreateMemCpy(
2981 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2982 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2983 break;
2984 }
2985 }
2986 }
2987
2988 Builder.CreateRetVoid();
2989 Builder.restoreIP(OldIP);
2990 return LtGCFunc;
2991}
2992
2993Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
2994 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2995 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
2996 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2997 LLVMContext &Ctx = M.getContext();
2998 auto *FuncTy = FunctionType::get(
2999 Builder.getVoidTy(),
3000 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3001 /* IsVarArg */ false);
3002 Function *LtGRFunc =
3003 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3004 "_omp_reduction_list_to_global_reduce_func", &M);
3005 LtGRFunc->setAttributes(FuncAttrs);
3006 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3007 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3008 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3009
3010 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3011 Builder.SetInsertPoint(EntryBlock);
3012
3013 // Buffer: global reduction buffer.
3014 Argument *BufferArg = LtGRFunc->getArg(0);
3015 // Idx: index of the buffer.
3016 Argument *IdxArg = LtGRFunc->getArg(1);
3017 // ReduceList: thread local Reduce list.
3018 Argument *ReduceListArg = LtGRFunc->getArg(2);
3019
3020 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3021 BufferArg->getName() + ".addr");
3022 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3023 IdxArg->getName() + ".addr");
3024 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3025 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3026 auto *RedListArrayTy =
3027 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3028
3029 // 1. Build a list of reduction variables.
3030 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3031 Value *LocalReduceList =
3032 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3033
3034 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3035 BufferArgAlloca, Builder.getPtrTy(),
3036 BufferArgAlloca->getName() + ".ascast");
3037 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3038 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3039 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3040 ReduceListArgAlloca, Builder.getPtrTy(),
3041 ReduceListArgAlloca->getName() + ".ascast");
3042 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 LocalReduceList, Builder.getPtrTy(),
3044 LocalReduceList->getName() + ".ascast");
3045
3046 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3047 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3048 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3049
3050 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3051 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3052 Type *IndexTy = Builder.getIndexTy(
3053 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3054 for (auto En : enumerate(ReductionInfos)) {
3055 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3056 RedListArrayTy, LocalReduceListAddrCast,
3057 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3058 Value *BufferVD =
3059 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3060 // Global = Buffer.VD[Idx];
3061 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3062 ReductionsBufferTy, BufferVD, 0, En.index());
3063 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3064 }
3065
3066 // Call reduce_function(GlobalReduceList, ReduceList)
3067 Value *ReduceList =
3068 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3069 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3070 ->addFnAttr(Attribute::NoUnwind);
3071 Builder.CreateRetVoid();
3072 Builder.restoreIP(OldIP);
3073 return LtGRFunc;
3074}
3075
3076Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3077 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3078 AttributeList FuncAttrs) {
3079 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3080 LLVMContext &Ctx = M.getContext();
3081 auto *FuncTy = FunctionType::get(
3082 Builder.getVoidTy(),
3083 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3084 /* IsVarArg */ false);
3085 Function *LtGCFunc =
3086 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3087 "_omp_reduction_global_to_list_copy_func", &M);
3088 LtGCFunc->setAttributes(FuncAttrs);
3089 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3090 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3091 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3092
3093 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3094 Builder.SetInsertPoint(EntryBlock);
3095
3096 // Buffer: global reduction buffer.
3097 Argument *BufferArg = LtGCFunc->getArg(0);
3098 // Idx: index of the buffer.
3099 Argument *IdxArg = LtGCFunc->getArg(1);
3100 // ReduceList: thread local Reduce list.
3101 Argument *ReduceListArg = LtGCFunc->getArg(2);
3102
3103 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3104 BufferArg->getName() + ".addr");
3105 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3106 IdxArg->getName() + ".addr");
3107 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3108 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3109 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 BufferArgAlloca, Builder.getPtrTy(),
3111 BufferArgAlloca->getName() + ".ascast");
3112 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3114 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3115 ReduceListArgAlloca, Builder.getPtrTy(),
3116 ReduceListArgAlloca->getName() + ".ascast");
3117 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3118 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3119 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3120
3121 Value *LocalReduceList =
3122 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3123 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3124 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3125 Type *IndexTy = Builder.getIndexTy(
3126 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3127 for (auto En : enumerate(ReductionInfos)) {
3128 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3129 auto *RedListArrayTy =
3130 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3131 // Reduce element = LocalReduceList[i]
3132 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3133 RedListArrayTy, LocalReduceList,
3134 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3135 // elemptr = ((CopyType*)(elemptrptr)) + I
3136 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3137 // Global = Buffer.VD[Idx];
3138 Value *BufferVD =
3139 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3140 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3141 ReductionsBufferTy, BufferVD, 0, En.index());
3142
3143 switch (RI.EvaluationKind) {
3144 case EvalKind::Scalar: {
3145 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3146 Builder.CreateStore(TargetElement, ElemPtr);
3147 break;
3148 }
3149 case EvalKind::Complex: {
3150 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3151 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3152 Value *SrcReal = Builder.CreateLoad(
3153 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3154 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3155 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3156 Value *SrcImg = Builder.CreateLoad(
3157 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3158
3159 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3160 RI.ElementType, ElemPtr, 0, 0, ".realp");
3161 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3162 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3163 Builder.CreateStore(SrcReal, DestRealPtr);
3164 Builder.CreateStore(SrcImg, DestImgPtr);
3165 break;
3166 }
3167 case EvalKind::Aggregate: {
3168 Value *SizeVal =
3169 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3170 Builder.CreateMemCpy(
3171 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3172 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3173 SizeVal, false);
3174 break;
3175 }
3176 }
3177 }
3178
3179 Builder.CreateRetVoid();
3180 Builder.restoreIP(OldIP);
3181 return LtGCFunc;
3182}
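// Editorial note (not part of the original source): the copy helper above
// is, schematically (elem_i as a placeholder for the buffer struct fields),
//
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for (i = 0; i < <n>; ++i)
//       *reduce_list[i] = ((struct _globalized_locals_ty *)buffer)[idx].elem_i;
//   }
//
// where scalars are copied by load/store, complex values field by field, and
// aggregates via memcpy, as the switch above shows.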
3183
3184Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3185 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3186 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3187 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3188 LLVMContext &Ctx = M.getContext();
3189 auto *FuncTy = FunctionType::get(
3190 Builder.getVoidTy(),
3191 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3192 /* IsVarArg */ false);
3193 Function *LtGRFunc =
3194 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3195 "_omp_reduction_global_to_list_reduce_func", &M);
3196 LtGRFunc->setAttributes(FuncAttrs);
3197 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3198 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3199 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3200
3201 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3202 Builder.SetInsertPoint(EntryBlock);
3203
3204 // Buffer: global reduction buffer.
3205 Argument *BufferArg = LtGRFunc->getArg(0);
3206 // Idx: index of the buffer.
3207 Argument *IdxArg = LtGRFunc->getArg(1);
3208 // ReduceList: thread local Reduce list.
3209 Argument *ReduceListArg = LtGRFunc->getArg(2);
3210
3211 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3212 BufferArg->getName() + ".addr");
3213 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3214 IdxArg->getName() + ".addr");
3215 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3216 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3217 ArrayType *RedListArrayTy =
3218 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3219
3220 // 1. Build a list of reduction variables.
3221 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3222 Value *LocalReduceList =
3223 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3224
3225 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3226 BufferArgAlloca, Builder.getPtrTy(),
3227 BufferArgAlloca->getName() + ".ascast");
3228 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3229 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3230 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3231 ReduceListArgAlloca, Builder.getPtrTy(),
3232 ReduceListArgAlloca->getName() + ".ascast");
3233 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3234 LocalReduceList, Builder.getPtrTy(),
3235 LocalReduceList->getName() + ".ascast");
3236
3237 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3238 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3239 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3240
3241 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3242 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3243 Type *IndexTy = Builder.getIndexTy(
3244 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3245 for (auto En : enumerate(ReductionInfos)) {
3246 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3247 RedListArrayTy, ReductionList,
3248 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3249 // Global = Buffer.VD[Idx];
3250 Value *BufferVD =
3251 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3252 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3253 ReductionsBufferTy, BufferVD, 0, En.index());
3254 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3255 }
3256
3257 // Call reduce_function(ReduceList, GlobalReduceList)
3258 Value *ReduceList =
3259 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3260 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3261 ->addFnAttr(Attribute::NoUnwind);
3262 Builder.CreateRetVoid();
3263 Builder.restoreIP(OldIP);
3264 return LtGRFunc;
3265}
3266
3267std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3268 std::string Suffix =
3269 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3270 return (Name + Suffix).str();
3271}
3272
3273Function *OpenMPIRBuilder::createReductionFunction(
3274 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3275 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3276 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3277 {Builder.getPtrTy(), Builder.getPtrTy()},
3278 /* IsVarArg */ false);
3279 std::string Name = getReductionFuncName(ReducerName);
3280 Function *ReductionFunc =
3281 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3282 ReductionFunc->setAttributes(FuncAttrs);
3283 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3284 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3285 BasicBlock *EntryBB =
3286 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3287 Builder.SetInsertPoint(EntryBB);
3288
3289 // Allocate memory here and resolve the pointers before extracting the
3290 // LHS/RHS pointers.
3291 Value *LHSArrayPtr = nullptr;
3292 Value *RHSArrayPtr = nullptr;
3293 Argument *Arg0 = ReductionFunc->getArg(0);
3294 Argument *Arg1 = ReductionFunc->getArg(1);
3295 Type *Arg0Type = Arg0->getType();
3296 Type *Arg1Type = Arg1->getType();
3297
3298 Value *LHSAlloca =
3299 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3300 Value *RHSAlloca =
3301 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3302 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3303 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3304 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3305 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3306 Builder.CreateStore(Arg0, LHSAddrCast);
3307 Builder.CreateStore(Arg1, RHSAddrCast);
3308 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3309 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3310
3311 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3312 Type *IndexTy = Builder.getIndexTy(
3313 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3314 SmallVector<Value *> LHSPtrs, RHSPtrs;
3315 for (auto En : enumerate(ReductionInfos)) {
3316 const ReductionInfo &RI = En.value();
3317 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3318 RedArrayTy, RHSArrayPtr,
3319 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3320 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3321 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3322 RHSI8Ptr, RI.PrivateVariable->getType(),
3323 RHSI8Ptr->getName() + ".ascast");
3324
3325 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3326 RedArrayTy, LHSArrayPtr,
3327 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3328 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3329 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3330 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3331
3332 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3333 LHSPtrs.emplace_back(LHSPtr);
3334 RHSPtrs.emplace_back(RHSPtr);
3335 } else {
3336 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3337 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3338 Value *Reduced;
3339 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3340 if (!Builder.GetInsertBlock())
3341 return ReductionFunc;
3342 Builder.CreateStore(Reduced, LHSPtr);
3343 }
3344 }
3345
3346 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3347 for (auto En : enumerate(ReductionInfos)) {
3348 unsigned Index = En.index();
3349 const ReductionInfo &RI = En.value();
3350 Value *LHSFixupPtr, *RHSFixupPtr;
3351 Builder.restoreIP(RI.ReductionGenClang(
3352 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3353
3354 // Fix the callback code generated to use the correct Values for the LHS
3355 // and RHS
3356 LHSFixupPtr->replaceUsesWithIf(
3357 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3358 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3359 ReductionFunc;
3360 });
3361 RHSFixupPtr->replaceUsesWithIf(
3362 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3363 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3364 ReductionFunc;
3365 });
3366 }
3367
3368 Builder.CreateRetVoid();
3369 return ReductionFunc;
3370}
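// Editorial note (not part of the original source): the emitted reduction
// function is essentially the sketch below, with op_i a placeholder for the
// operation the ReductionGen / ReductionGenClang callbacks produce:
//
//   void <reducer-name + platform suffix>(void *lhs[], void *rhs[]) {
//     for (i = 0; i < <n>; ++i)
//       *lhs[i] = *lhs[i] op_i *rhs[i];
//   }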
3371
3372static void
3373checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3374 bool IsGPU) {
3375 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3376 (void)RI;
3377 assert(RI.Variable && "expected non-null variable");
3378 assert(RI.PrivateVariable && "expected non-null private variable");
3379 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3380 "expected non-null reduction generator callback");
3381 if (!IsGPU) {
3382 assert(
3383 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3384 "expected variables and their private equivalents to have the same "
3385 "type");
3386 }
3387 assert(RI.Variable->getType()->isPointerTy() &&
3388 "expected variables to be pointers");
3389 }
3390}
3391
3392OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
3393 const LocationDescription &Loc, InsertPointTy AllocaIP,
3394 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3395 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3396 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3397 unsigned ReductionBufNum, Value *SrcLocInfo) {
3398 if (!updateToLocation(Loc))
3399 return InsertPointTy();
3400 Builder.restoreIP(CodeGenIP);
3401 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3402 LLVMContext &Ctx = M.getContext();
3403
3404 // Source location for the ident struct
3405 if (!SrcLocInfo) {
3406 uint32_t SrcLocStrSize;
3407 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3408 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3409 }
3410
3411 if (ReductionInfos.size() == 0)
3412 return Builder.saveIP();
3413
3414 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3415 AttributeList FuncAttrs;
3416 AttrBuilder AttrBldr(Ctx);
3417 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3418 AttrBldr.addAttribute(Attr);
3419 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3420 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3421
3422 Function *ReductionFunc = nullptr;
3423 CodeGenIP = Builder.saveIP();
3424 ReductionFunc =
3425 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3426 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3427 Builder.restoreIP(CodeGenIP);
3428
3429 // Set the grid value in the config needed for lowering later on
3430 if (GridValue.has_value())
3431 Config.setGridValue(GridValue.value());
3432 else
3433 Config.setGridValue(getGridValue(T, ReductionFunc));
3434
3435 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3436 // RedList, shuffle_reduce_func, interwarp_copy_func);
3437 // or
3438 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3439 Value *Res;
3440
3441 // 1. Build a list of reduction variables.
3442 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3443 auto Size = ReductionInfos.size();
3444 Type *PtrTy = PointerType::getUnqual(Ctx);
3445 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3446 CodeGenIP = Builder.saveIP();
3447 Builder.restoreIP(AllocaIP);
3448 Value *ReductionListAlloca =
3449 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3450 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3451 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3452 Builder.restoreIP(CodeGenIP);
3453 Type *IndexTy = Builder.getIndexTy(
3454 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3455 for (auto En : enumerate(ReductionInfos)) {
3456 const ReductionInfo &RI = En.value();
3457 Value *ElemPtr = Builder.CreateInBoundsGEP(
3458 RedArrayTy, ReductionList,
3459 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3460 Value *CastElem =
3461 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3462 Builder.CreateStore(CastElem, ElemPtr);
3463 }
3464 CodeGenIP = Builder.saveIP();
3465 Function *SarFunc =
3466 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3467 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3468 Builder.restoreIP(CodeGenIP);
3469
3470 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3471
3472 unsigned MaxDataSize = 0;
3473 SmallVector<Type *> ReductionTypeArgs;
3474 for (auto En : enumerate(ReductionInfos)) {
3475 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3476 if (Size > MaxDataSize)
3477 MaxDataSize = Size;
3478 ReductionTypeArgs.emplace_back(En.value().ElementType);
3479 }
3480 Value *ReductionDataSize =
3481 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3482 if (!IsTeamsReduction) {
3483 Value *SarFuncCast =
3484 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3485 Value *WcFuncCast =
3486 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3487 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3488 WcFuncCast};
3489 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3490 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3491 Res = Builder.CreateCall(Pv2Ptr, Args);
3492 } else {
3493 CodeGenIP = Builder.saveIP();
3494 StructType *ReductionsBufferTy = StructType::create(
3495 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3496 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3497 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3498 Function *LtGCFunc = emitListToGlobalCopyFunction(
3499 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3500 Function *LtGRFunc = emitListToGlobalReduceFunction(
3501 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3502 Function *GtLCFunc = emitGlobalToListCopyFunction(
3503 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3504 Function *GtLRFunc = emitGlobalToListReduceFunction(
3505 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3506 Builder.restoreIP(CodeGenIP);
3507
3508 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3509 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3510
3511 Value *Args3[] = {SrcLocInfo,
3512 KernelTeamsReductionPtr,
3513 Builder.getInt32(ReductionBufNum),
3514 ReductionDataSize,
3515 RL,
3516 SarFunc,
3517 WcFunc,
3518 LtGCFunc,
3519 LtGRFunc,
3520 GtLCFunc,
3521 GtLRFunc};
3522
3523 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3524 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3525 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3526 }
3527
3528 // 5. Build if (res == 1)
3529 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3530 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3531 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3532 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3533
3534 // 6. Build then branch: where we have reduced values in the master
3535 // thread in each team.
3536 // __kmpc_end_reduce{_nowait}(<gtid>);
3537 // break;
3538 emitBlock(ThenBB, CurFunc);
3539
3540 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3541 for (auto En : enumerate(ReductionInfos)) {
3542 const ReductionInfo &RI = En.value();
3543 Value *LHS = RI.Variable;
3544 Value *RHS =
3545 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3546
3547 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3548 Value *LHSPtr, *RHSPtr;
3549 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3550 &LHSPtr, &RHSPtr, CurFunc));
3551
3552 // Fix the callback code generated to use the correct Values for the LHS
3553 // and RHS
3554 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3555 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3556 ReductionFunc;
3557 });
3558 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3559 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3560 ReductionFunc;
3561 });
3562 } else {
3563 assert(false && "Unhandled ReductionGenCBKind");
3564 }
3565 }
3566 emitBlock(ExitBB, CurFunc);
3567
3568 Config.setEmitLLVMUsed();
3569
3570 return Builder.saveIP();
3571}
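// Editorial note (not part of the original source): overall, the code
// emitted by createReductionsGPU has the shape
//
//   res = __kmpc_nvptx_parallel_reduce_nowait_v2(loc, size, RedList,
//                                                shuffle_reduce, interwarp_copy);
//   // or __kmpc_nvptx_teams_reduce_nowait_v2 with the four list<->global
//   // helpers and the fixed team-reduction buffer
//   if (res == 1) {                  // this thread holds the final values
//     <combine RedList back into the original variables>
//   }                                // fall through to .omp.reduction.done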
3572
3573static Function *getFreshReductionFunc(Module &M) {
3574 Type *VoidTy = Type::getVoidTy(M.getContext());
3575 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3576 auto *FuncTy =
3577 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3578 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3579 ".omp.reduction.func", &M);
3580}
3581
3582OpenMPIRBuilder::InsertPointTy
3583OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3584 InsertPointTy AllocaIP,
3585 ArrayRef<ReductionInfo> ReductionInfos,
3586 ArrayRef<bool> IsByRef, bool IsNoWait) {
3587 assert(ReductionInfos.size() == IsByRef.size());
3588 for (const ReductionInfo &RI : ReductionInfos) {
3589 (void)RI;
3590 assert(RI.Variable && "expected non-null variable");
3591 assert(RI.PrivateVariable && "expected non-null private variable");
3592 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3593 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3594 "expected variables and their private equivalents to have the same "
3595 "type");
3596 assert(RI.Variable->getType()->isPointerTy() &&
3597 "expected variables to be pointers");
3598 }
3599
3600 if (!updateToLocation(Loc))
3601 return InsertPointTy();
3602
3603 BasicBlock *InsertBlock = Loc.IP.getBlock();
3604 BasicBlock *ContinuationBlock =
3605 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3606 InsertBlock->getTerminator()->eraseFromParent();
3607
3608 // Create and populate array of type-erased pointers to private reduction
3609 // values.
3610 unsigned NumReductions = ReductionInfos.size();
3611 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3612 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3613 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3614
3615 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3616
3617 for (auto En : enumerate(ReductionInfos)) {
3618 unsigned Index = En.index();
3619 const ReductionInfo &RI = En.value();
3620 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3621 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3622 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3623 }
3624
3625 // Emit a call to the runtime function that orchestrates the reduction.
3626 // Declare the reduction function in the process.
3627 Function *Func = Builder.GetInsertBlock()->getParent();
3628 Module *Module = Func->getParent();
3629 uint32_t SrcLocStrSize;
3630 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3631 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3632 return RI.AtomicReductionGen;
3633 });
3634 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3635 CanGenerateAtomic
3636 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3637 : IdentFlag(0));
3638 Value *ThreadId = getOrCreateThreadID(Ident);
3639 Constant *NumVariables = Builder.getInt32(NumReductions);
3640 const DataLayout &DL = Module->getDataLayout();
3641 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3642 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3643 Function *ReductionFunc = getFreshReductionFunc(*Module);
3644 Value *Lock = getOMPCriticalRegionLock(".reduction");
3645 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3646 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3647 : RuntimeFunction::OMPRTL___kmpc_reduce);
3648 CallInst *ReduceCall =
3649 Builder.CreateCall(ReduceFunc,
3650 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3651 ReductionFunc, Lock},
3652 "reduce");
3653
3654 // Create final reduction entry blocks for the atomic and non-atomic case.
3655 // Emit IR that dispatches control flow to one of the blocks based on the
3656 // reduction supporting the atomic mode.
3657 BasicBlock *NonAtomicRedBlock =
3658 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3659 BasicBlock *AtomicRedBlock =
3660 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3661 SwitchInst *Switch =
3662 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3663 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3664 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3665
3666 // Populate the non-atomic reduction using the elementwise reduction function.
3667 // This loads the elements from the global and private variables and reduces
3668 // them before storing back the result to the global variable.
3669 Builder.SetInsertPoint(NonAtomicRedBlock);
3670 for (auto En : enumerate(ReductionInfos)) {
3671 const ReductionInfo &RI = En.value();
3672 Type *ValueType = RI.ElementType;
3673 // We have one less load for the by-ref case because that load is now
3674 // inside the reduction region.
3675 Value *RedValue = nullptr;
3676 if (!IsByRef[En.index()]) {
3677 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3678 "red.value." + Twine(En.index()));
3679 }
3680 Value *PrivateRedValue =
3681 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3682 "red.private.value." + Twine(En.index()));
3683 Value *Reduced;
3684 if (IsByRef[En.index()]) {
3685 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
3686 PrivateRedValue, Reduced));
3687 } else {
3688 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
3689 PrivateRedValue, Reduced));
3690 }
3691 if (!Builder.GetInsertBlock())
3692 return InsertPointTy();
3693 // For the by-ref case, the load is inside the reduction region.
3694 if (!IsByRef[En.index()])
3695 Builder.CreateStore(Reduced, RI.Variable);
3696 }
3697 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3698 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3699 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3700 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3701 Builder.CreateBr(ContinuationBlock);
3702
3703 // Populate the atomic reduction using the atomic elementwise reduction
3704 // function. There are no loads/stores here because they will be happening
3705 // inside the atomic elementwise reduction.
3706 Builder.SetInsertPoint(AtomicRedBlock);
3707 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3708 for (const ReductionInfo &RI : ReductionInfos) {
3709 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
3710 RI.Variable, RI.PrivateVariable));
3711 if (!Builder.GetInsertBlock())
3712 return InsertPointTy();
3713 }
3714 Builder.CreateBr(ContinuationBlock);
3715 } else {
3716 Builder.CreateUnreachable();
3717 }
3718
3719 // Populate the outlined reduction function using the elementwise reduction
3720 // function. Partial values are extracted from the type-erased array of
3721 // pointers to private variables.
3722 BasicBlock *ReductionFuncBlock =
3723 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3724 Builder.SetInsertPoint(ReductionFuncBlock);
3725 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3726 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3727
3728 for (auto En : enumerate(ReductionInfos)) {
3729 const ReductionInfo &RI = En.value();
3730 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3731 RedArrayTy, LHSArrayPtr, 0, En.index());
3732 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3733 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3734 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3735 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3736 RedArrayTy, RHSArrayPtr, 0, En.index());
3737 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3738 Value *RHSPtr =
3739 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3740 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3741 Value *Reduced;
3742 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
3743 if (!Builder.GetInsertBlock())
3744 return InsertPointTy();
3745 // The store is inside the reduction region when using by-ref.
3746 if (!IsByRef[En.index()])
3747 Builder.CreateStore(Reduced, LHSPtr);
3748 }
3749 Builder.CreateRetVoid();
3750
3751 Builder.SetInsertPoint(ContinuationBlock);
3752 return Builder.saveIP();
3753}
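// Editorial note (not part of the original source): on the host,
// createReductions produces code of the shape
//
//   red.array[i] = &private_i;
//   switch (__kmpc_reduce{_nowait}(loc, tid, <n>, sizeof(red.array),
//                                  red.array, .omp.reduction.func, &lock)) {
//   case 1:                        // non-atomic path
//     var_i = var_i op private_i;
//     __kmpc_end_reduce{_nowait}(loc, tid, &lock);
//     break;
//   case 2:                        // atomic path, taken only when every
//     atomic(var_i op= private_i); // element has an AtomicReductionGen
//     break;
//   default:                       // another thread owns the reduction
//     break;
//   }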
3754
3755OpenMPIRBuilder::InsertPointTy
3756OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3757 BodyGenCallbackTy BodyGenCB,
3758 FinalizeCallbackTy FiniCB) {
3759
3760 if (!updateToLocation(Loc))
3761 return Loc.IP;
3762
3763 Directive OMPD = Directive::OMPD_master;
3764 uint32_t SrcLocStrSize;
3765 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3766 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3767 Value *ThreadId = getOrCreateThreadID(Ident);
3768 Value *Args[] = {Ident, ThreadId};
3769
3770 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3771 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3772
3773 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3774 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3775
3776 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3777 /*Conditional*/ true, /*hasFinalize*/ true);
3778}
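// Editorial note (not part of the original source): the generated code is a
// conditionally executed, finalized inlined region:
//
//   if (__kmpc_master(loc, tid)) {
//     <body>
//     __kmpc_end_master(loc, tid);
//   }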
3779
3780OpenMPIRBuilder::InsertPointTy
3781OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3782 BodyGenCallbackTy BodyGenCB,
3783 FinalizeCallbackTy FiniCB, Value *Filter) {
3784 if (!updateToLocation(Loc))
3785 return Loc.IP;
3786
3787 Directive OMPD = Directive::OMPD_masked;
3788 uint32_t SrcLocStrSize;
3789 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3790 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3791 Value *ThreadId = getOrCreateThreadID(Ident);
3792 Value *Args[] = {Ident, ThreadId, Filter};
3793 Value *ArgsEnd[] = {Ident, ThreadId};
3794
3795 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3796 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3797
3798 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3799 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3800
3801 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3802 /*Conditional*/ true, /*hasFinalize*/ true);
3803}
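// Editorial note (not part of the original source): structurally the same as
// createMaster, but the runtime decides ownership from the filter value:
//
//   if (__kmpc_masked(loc, tid, filter)) {
//     <body>
//     __kmpc_end_masked(loc, tid);
//   }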
3804
3805CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3806 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3807 BasicBlock *PostInsertBefore, const Twine &Name) {
3808 Module *M = F->getParent();
3809 LLVMContext &Ctx = M->getContext();
3810 Type *IndVarTy = TripCount->getType();
3811
3812 // Create the basic block structure.
3813 BasicBlock *Preheader =
3814 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3815 BasicBlock *Header =
3816 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3817 BasicBlock *Cond =
3818 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3819 BasicBlock *Body =
3820 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3821 BasicBlock *Latch =
3822 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3823 BasicBlock *Exit =
3824 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3825 BasicBlock *After =
3826 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3827
3828 // Use specified DebugLoc for new instructions.
3829 Builder.SetCurrentDebugLocation(DL);
3830
3831 Builder.SetInsertPoint(Preheader);
3832 Builder.CreateBr(Header);
3833
3834 Builder.SetInsertPoint(Header);
3835 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3836 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3837 Builder.CreateBr(Cond);
3838
3839 Builder.SetInsertPoint(Cond);
3840 Value *Cmp =
3841 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3842 Builder.CreateCondBr(Cmp, Body, Exit);
3843
3844 Builder.SetInsertPoint(Body);
3845 Builder.CreateBr(Latch);
3846
3847 Builder.SetInsertPoint(Latch);
3848 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3849 "omp_" + Name + ".next", /*HasNUW=*/true);
3850 Builder.CreateBr(Header);
3851 IndVarPHI->addIncoming(Next, Latch);
3852
3853 Builder.SetInsertPoint(Exit);
3854 Builder.CreateBr(After);
3855
3856 // Remember and return the canonical control flow.
3857 LoopInfos.emplace_front();
3858 CanonicalLoopInfo *CL = &LoopInfos.front();
3859
3860 CL->Header = Header;
3861 CL->Cond = Cond;
3862 CL->Latch = Latch;
3863 CL->Exit = Exit;
3864
3865#ifndef NDEBUG
3866 CL->assertOK();
3867#endif
3868 return CL;
3869}
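// Editorial note (not part of the original source): the skeleton's control
// flow is
//
//   preheader -> header -> cond
//   cond:  iv < tripcount ? body : exit
//   body -> latch (iv += 1) -> header
//   exit -> after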
3870
3871CanonicalLoopInfo *
3872OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3873 LoopBodyGenCallbackTy BodyGenCB,
3874 Value *TripCount, const Twine &Name) {
3875 BasicBlock *BB = Loc.IP.getBlock();
3876 BasicBlock *NextBB = BB->getNextNode();
3877
3878 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3879 NextBB, NextBB, Name);
3880 BasicBlock *After = CL->getAfter();
3881
3882 // If location is not set, don't connect the loop.
3883 if (updateToLocation(Loc)) {
3884 // Split the loop at the insertion point: Branch to the preheader and move
3885 // every following instruction to after the loop (the After BB). Also, the
3886 // new successor is the loop's after block.
3887 spliceBB(Builder, After, /*CreateBranch=*/false);
3888 Builder.CreateBr(CL->getPreheader());
3889 }
3890
3891 // Emit the body content. We do it after connecting the loop to the CFG
3892 // so that the callback does not encounter degenerate BBs.
3893 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
3894
3895#ifndef NDEBUG
3896 CL->assertOK();
3897#endif
3898 return CL;
3899}
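// Editorial note (not part of the original source): a typical caller looks
// like the sketch below; OMPBuilder, Loc and TripCount are assumed to exist.
//
//   CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
//       Loc,
//       [&](InsertPointTy CodeGenIP, Value *IV) {
//         Builder.restoreIP(CodeGenIP);
//         // emit one iteration of the body; IV ranges over [0, TripCount)
//       },
//       TripCount);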
3900
3901CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
3902 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3903 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3904 InsertPointTy ComputeIP, const Twine &Name) {
3905
3906 // Consider the following difficulties (assuming 8-bit signed integers):
3907 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3908 // DO I = 1, 100, 50
3909 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3910 // DO I = 100, 0, -128
3911
3912 // Start, Stop and Step must be of the same integer type.
3913 auto *IndVarTy = cast<IntegerType>(Start->getType());
3914 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3915 assert(IndVarTy == Step->getType() && "Step type mismatch");
3916
3917 LocationDescription ComputeLoc =
3918 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3919 updateToLocation(ComputeLoc);
3920
3921 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3922 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3923
3924 // Like Step, but always positive.
3925 Value *Incr = Step;
3926
3927 // Distance between Start and Stop; always positive.
3928 Value *Span;
3929
3930 // Condition whether no iterations are executed at all, e.g. because
3931 // UB < LB.
3932 Value *ZeroCmp;
3933
3934 if (IsSigned) {
3935 // Ensure that increment is positive. If not, negate and invert LB and UB.
3936 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3937 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3938 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3939 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3940 Span = Builder.CreateSub(UB, LB, "", false, true);
3941 ZeroCmp = Builder.CreateICmp(
3942 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3943 } else {
3944 Span = Builder.CreateSub(Stop, Start, "", true);
3945 ZeroCmp = Builder.CreateICmp(
3946 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3947 }
3948
3949 Value *CountIfLooping;
3950 if (InclusiveStop) {
3951 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3952 } else {
3953 // Avoid incrementing past stop since it could overflow.
3954 Value *CountIfTwo = Builder.CreateAdd(
3955 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3956 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3957 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3958 }
3959 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3960 "omp_" + Name + ".tripcount");
3961
3962 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3963 Builder.restoreIP(CodeGenIP);
3964 Value *Span = Builder.CreateMul(IV, Step);
3965 Value *IndVar = Builder.CreateAdd(Span, Start);
3966 BodyGenCB(Builder.saveIP(), IndVar);
3967 };
3968 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3969 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3970}
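// Editorial worked example (not part of the original source), unsigned with
// exclusive stop: for Start=0, Stop=10, Step=3,
//   Span       = 10 - 0           = 10
//   CountIfTwo = (10 - 1) / 3 + 1 = 4
//   Span <= Incr is false, so CountIfLooping = 4; ZeroCmp (10 <= 0) is also
//   false, so TripCount = 4, matching the iterations 0, 3, 6 and 9.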
3971
3972// Returns an LLVM function to call for initializing loop bounds using OpenMP
3973// static scheduling depending on `type`. Only i32 and i64 are supported by the
3974// runtime. Always interpret integers as unsigned similarly to
3975// CanonicalLoopInfo.
3976static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
3977 OpenMPIRBuilder &OMPBuilder) {
3978 unsigned Bitwidth = Ty->getIntegerBitWidth();
3979 if (Bitwidth == 32)
3980 return OMPBuilder.getOrCreateRuntimeFunction(
3981 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3982 if (Bitwidth == 64)
3983 return OMPBuilder.getOrCreateRuntimeFunction(
3984 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3985 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3986}
3987
3988OpenMPIRBuilder::InsertPointTy
3989OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
3990 InsertPointTy AllocaIP,
3991 bool NeedsBarrier) {
3992 assert(CLI->isValid() && "Requires a valid canonical loop");
3993 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3994 "Require dedicated allocate IP");
3995
3996 // Set up the source location value for OpenMP runtime.
3999
4000 uint32_t SrcLocStrSize;
4001 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4002 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4003
4004 // Declare useful OpenMP runtime functions.
4005 Value *IV = CLI->getIndVar();
4006 Type *IVTy = IV->getType();
4007 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4008 FunctionCallee StaticFini =
4009 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4010
4011 // Allocate space for computed loop bounds as expected by the "init" function.
4012 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4013
4014 Type *I32Type = Type::getInt32Ty(M.getContext());
4015 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4016 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4017 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4018 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4019
4020 // At the end of the preheader, prepare for calling the "init" function by
4021 // storing the current loop bounds into the allocated space. A canonical loop
4022 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4023 // and produces an inclusive upper bound.
4024 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4025 Constant *Zero = ConstantInt::get(IVTy, 0);
4026 Constant *One = ConstantInt::get(IVTy, 1);
4027 Builder.CreateStore(Zero, PLowerBound);
4028 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4029 Builder.CreateStore(UpperBound, PUpperBound);
4030 Builder.CreateStore(One, PStride);
4031
4032 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4033
4034 Constant *SchedulingType = ConstantInt::get(
4035 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4036
4037 // Call the "init" function and update the trip count of the loop with the
4038 // value it produced.
4039 Builder.CreateCall(StaticInit,
4040 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4041 PUpperBound, PStride, One, Zero});
4042 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4043 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4044 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4045 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4046 CLI->setTripCount(TripCount);
4047
4048 // Update all uses of the induction variable except the one in the condition
4049 // block that compares it with the actual upper bound, and the increment in
4050 // the latch block.
4051
4052 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4053 Builder.SetInsertPoint(CLI->getBody(),
4054 CLI->getBody()->getFirstInsertionPt());
4055 Builder.SetCurrentDebugLocation(DL);
4056 return Builder.CreateAdd(OldIV, LowerBound);
4057 });
4058
4059 // In the "exit" block, call the "fini" function.
4060 Builder.SetInsertPoint(CLI->getExit(),
4061 CLI->getExit()->getTerminator()->getIterator());
4062 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4063
4064 // Add the barrier if requested.
4065 if (NeedsBarrier)
4066 createBarrier(LocationDescription(Builder.saveIP(), DL),
4067 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4068 /* CheckCancelFlag */ false);
4069
4070 InsertPointTy AfterIP = CLI->getAfterIP();
4071 CLI->invalidate();
4072
4073 return AfterIP;
4074}
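// Editorial note (not part of the original source): per thread, the loop is
// now roughly
//
//   __kmpc_for_static_init_{4u,8u}(loc, tid, /*kmp_sch_static*/34, &last,
//                                  &lb, &ub, &stride, 1, 0);
//   for (iv = 0; iv < ub - lb + 1; ++iv)
//     <body>(iv + lb);           // old IV rebased onto this thread's chunk
//   __kmpc_for_static_fini(loc, tid);
//   // plus an optional OMPD_for barrier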
4075
4076OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4077 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4078 bool NeedsBarrier, Value *ChunkSize) {
4079 assert(CLI->isValid() && "Requires a valid canonical loop");
4080 assert(ChunkSize && "Chunk size is required");
4081
4082 LLVMContext &Ctx = CLI->getFunction()->getContext();
4083 Value *IV = CLI->getIndVar();
4084 Value *OrigTripCount = CLI->getTripCount();
4085 Type *IVTy = IV->getType();
4086 assert(IVTy->getIntegerBitWidth() <= 64 &&
4087 "Max supported tripcount bitwidth is 64 bits");
4088 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4089 : Type::getInt64Ty(Ctx);
4090 Type *I32Type = Type::getInt32Ty(M.getContext());
4091 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4092 Constant *One = ConstantInt::get(InternalIVTy, 1);
4093
4094 // Declare useful OpenMP runtime functions.
4095 FunctionCallee StaticInit =
4096 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4097 FunctionCallee StaticFini =
4098 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4099
4100 // Allocate space for computed loop bounds as expected by the "init" function.
4101 Builder.restoreIP(AllocaIP);
4102 Builder.SetCurrentDebugLocation(DL);
4103 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4104 Value *PLowerBound =
4105 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4106 Value *PUpperBound =
4107 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4108 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4109
4110 // Set up the source location value for the OpenMP runtime.
4111 Builder.restoreIP(CLI->getPreheaderIP());
4112 Builder.SetCurrentDebugLocation(DL);
4113
4114 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4115 Value *CastedChunkSize =
4116 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4117 Value *CastedTripCount =
4118 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4119
4120 Constant *SchedulingType = ConstantInt::get(
4121 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4122 Builder.CreateStore(Zero, PLowerBound);
4123 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4124 Builder.CreateStore(OrigUpperBound, PUpperBound);
4125 Builder.CreateStore(One, PStride);
4126
4127 // Call the "init" function and update the trip count of the loop with the
4128 // value it produced.
4129 uint32_t SrcLocStrSize;
4130 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4131 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4132 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4133 Builder.CreateCall(StaticInit,
4134 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4135 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4136 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4137 /*pstride=*/PStride, /*incr=*/One,
4138 /*chunk=*/CastedChunkSize});
4139
4140 // Load values written by the "init" function.
4141 Value *FirstChunkStart =
4142 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4143 Value *FirstChunkStop =
4144 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4145 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4146 Value *ChunkRange =
4147 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4148 Value *NextChunkStride =
4149 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4150
4151 // Create outer "dispatch" loop for enumerating the chunks.
4152 BasicBlock *DispatchEnter = splitBB(Builder, true);
4153 Value *DispatchCounter;
4154 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
4155 {Builder.saveIP(), DL},
4156 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
4157 FirstChunkStart, CastedTripCount, NextChunkStride,
4158 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4159 "dispatch");
4160
4161 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4162 // not have to preserve the canonical invariant.
4163 BasicBlock *DispatchBody = DispatchCLI->getBody();
4164 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4165 BasicBlock *DispatchExit = DispatchCLI->getExit();
4166 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4167 DispatchCLI->invalidate();
4168
4169 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4170 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4171 redirectTo(CLI->getExit(), DispatchLatch, DL);
4172 redirectTo(DispatchBody, DispatchEnter, DL);
4173
4174 // Prepare the prolog of the chunk loop.
4175 Builder.restoreIP(CLI->getPreheaderIP());
4176 Builder.SetCurrentDebugLocation(DL);
4177
4178 // Compute the number of iterations of the chunk loop.
4179 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4180 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4181 Value *IsLastChunk =
4182 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4183 Value *CountUntilOrigTripCount =
4184 Builder.CreateSub(CastedTripCount, DispatchCounter);
4185 Value *ChunkTripCount = Builder.CreateSelect(
4186 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4187 Value *BackcastedChunkTC =
4188 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4189 CLI->setTripCount(BackcastedChunkTC);
4190
4191 // Update all uses of the induction variable except the one in the condition
4192 // block that compares it with the actual upper bound, and the increment in
4193 // the latch block.
4194 Value *BackcastedDispatchCounter =
4195 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4196 CLI->mapIndVar([&](Instruction *) -> Value * {
4197 Builder.restoreIP(CLI->getBodyIP());
4198 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4199 });
4200
4201 // In the "exit" block, call the "fini" function.
4202 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4203 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4204
4205 // Add the barrier if requested.
4206 if (NeedsBarrier)
4207 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4208 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4209
4210#ifndef NDEBUG
4211 // Even though we currently do not support applying additional methods to it,