LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
64
65#include <cstdint>
66#include <optional>
67
68#define DEBUG_TYPE "openmp-ir-builder"
69
70using namespace llvm;
71using namespace omp;
72
/// Hidden command-line flag ("openmp-ir-builder-optimistic-attributes"):
/// when enabled, runtime calls created by the builder are annotated with
/// optimistic attributes describing their "as-if" behavior. Off by default.
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
78
80 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
81 cl::desc("Factor for the unroll threshold to account for code "
82 "simplifications still taking place"),
83 cl::init(1.5));
84
85#ifndef NDEBUG
86/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
87/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
88/// an InsertPoint stores the instruction before something is inserted. For
89/// instance, if both point to the same instruction, two IRBuilders alternating
90/// creating instruction will cause the instructions to be interleaved.
93 if (!IP1.isSet() || !IP2.isSet())
94 return false;
95 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
96}
97
99 // Valid ordered/unordered and base algorithm combinations.
100 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
101 case OMPScheduleType::UnorderedStaticChunked:
102 case OMPScheduleType::UnorderedStatic:
103 case OMPScheduleType::UnorderedDynamicChunked:
104 case OMPScheduleType::UnorderedGuidedChunked:
105 case OMPScheduleType::UnorderedRuntime:
106 case OMPScheduleType::UnorderedAuto:
107 case OMPScheduleType::UnorderedTrapezoidal:
108 case OMPScheduleType::UnorderedGreedy:
109 case OMPScheduleType::UnorderedBalanced:
110 case OMPScheduleType::UnorderedGuidedIterativeChunked:
111 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
112 case OMPScheduleType::UnorderedSteal:
113 case OMPScheduleType::UnorderedStaticBalancedChunked:
114 case OMPScheduleType::UnorderedGuidedSimd:
115 case OMPScheduleType::UnorderedRuntimeSimd:
116 case OMPScheduleType::OrderedStaticChunked:
117 case OMPScheduleType::OrderedStatic:
118 case OMPScheduleType::OrderedDynamicChunked:
119 case OMPScheduleType::OrderedGuidedChunked:
120 case OMPScheduleType::OrderedRuntime:
121 case OMPScheduleType::OrderedAuto:
122 case OMPScheduleType::OrderdTrapezoidal:
123 case OMPScheduleType::NomergeUnorderedStaticChunked:
124 case OMPScheduleType::NomergeUnorderedStatic:
125 case OMPScheduleType::NomergeUnorderedDynamicChunked:
126 case OMPScheduleType::NomergeUnorderedGuidedChunked:
127 case OMPScheduleType::NomergeUnorderedRuntime:
128 case OMPScheduleType::NomergeUnorderedAuto:
129 case OMPScheduleType::NomergeUnorderedTrapezoidal:
130 case OMPScheduleType::NomergeUnorderedGreedy:
131 case OMPScheduleType::NomergeUnorderedBalanced:
132 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
133 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
134 case OMPScheduleType::NomergeUnorderedSteal:
135 case OMPScheduleType::NomergeOrderedStaticChunked:
136 case OMPScheduleType::NomergeOrderedStatic:
137 case OMPScheduleType::NomergeOrderedDynamicChunked:
138 case OMPScheduleType::NomergeOrderedGuidedChunked:
139 case OMPScheduleType::NomergeOrderedRuntime:
140 case OMPScheduleType::NomergeOrderedAuto:
141 case OMPScheduleType::NomergeOrderedTrapezoidal:
142 case OMPScheduleType::OrderedDistributeChunked:
143 case OMPScheduleType::OrderedDistribute:
144 break;
145 default:
146 return false;
147 }
148
149 // Must not set both monotonicity modifiers at the same time.
150 OMPScheduleType MonotonicityFlags =
151 SchedType & OMPScheduleType::MonotonicityMask;
152 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
153 return false;
154
155 return true;
156}
157#endif
158
159/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
160/// debug location to the last instruction in the specified basic block if the
161/// insert point points to the end of the block.
164 Builder.restoreIP(IP);
165 llvm::BasicBlock *BB = Builder.GetInsertBlock();
166 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
167 if (!BB->empty() && I == BB->end())
168 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
169}
170
171static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
172 if (T.isAMDGPU()) {
173 StringRef Features =
174 Kernel->getFnAttribute("target-features").getValueAsString();
175 if (Features.count("+wavefrontsize64"))
178 }
179 if (T.isNVPTX())
181 if (T.isSPIRV())
183 llvm_unreachable("No grid value available for this architecture!");
184}
185
186/// Determine which scheduling algorithm to use, determined from schedule clause
187/// arguments.
188static OMPScheduleType
189getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
190 bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
192 switch (ClauseKind) {
193 case OMP_SCHEDULE_Default:
194 case OMP_SCHEDULE_Static:
195 return HasChunks ? OMPScheduleType::BaseStaticChunked
196 : OMPScheduleType::BaseStatic;
197 case OMP_SCHEDULE_Dynamic:
198 return OMPScheduleType::BaseDynamicChunked;
199 case OMP_SCHEDULE_Guided:
200 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
201 : OMPScheduleType::BaseGuidedChunked;
202 case OMP_SCHEDULE_Auto:
204 case OMP_SCHEDULE_Runtime:
205 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
206 : OMPScheduleType::BaseRuntime;
207 case OMP_SCHEDULE_Distribute:
208 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
209 : OMPScheduleType::BaseDistribute;
210 }
211 llvm_unreachable("unhandled schedule clause argument");
212}
213
214/// Adds ordering modifier flags to schedule type.
215static OMPScheduleType
217 bool HasOrderedClause) {
218 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
219 OMPScheduleType::None &&
220 "Must not have ordering nor monotonicity flags already set");
221
222 OMPScheduleType OrderingModifier = HasOrderedClause
223 ? OMPScheduleType::ModifierOrdered
224 : OMPScheduleType::ModifierUnordered;
225 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
226
227 // Unsupported combinations
228 if (OrderingScheduleType ==
229 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
230 return OMPScheduleType::OrderedGuidedChunked;
231 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
232 OMPScheduleType::ModifierOrdered))
233 return OMPScheduleType::OrderedRuntime;
234
235 return OrderingScheduleType;
236}
237
238/// Adds monotonicity modifier flags to schedule type.
239static OMPScheduleType
241 bool HasSimdModifier, bool HasMonotonic,
242 bool HasNonmonotonic, bool HasOrderedClause) {
243 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
244 OMPScheduleType::None &&
245 "Must not have monotonicity flags already set");
246 assert((!HasMonotonic || !HasNonmonotonic) &&
247 "Monotonic and Nonmonotonic are contradicting each other");
248
249 if (HasMonotonic) {
250 return ScheduleType | OMPScheduleType::ModifierMonotonic;
251 } else if (HasNonmonotonic) {
252 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
253 } else {
254 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
255 // If the static schedule kind is specified or if the ordered clause is
256 // specified, and if the nonmonotonic modifier is not specified, the
257 // effect is as if the monotonic modifier is specified. Otherwise, unless
258 // the monotonic modifier is specified, the effect is as if the
259 // nonmonotonic modifier is specified.
260 OMPScheduleType BaseScheduleType =
261 ScheduleType & ~OMPScheduleType::ModifierMask;
262 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
263 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
264 HasOrderedClause) {
265 // The monotonic is used by default in openmp runtime library, so no need
266 // to set it.
267 return ScheduleType;
268 } else {
269 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
270 }
271 }
272}
273
274/// Determine the schedule type using schedule and ordering clause arguments.
275static OMPScheduleType
276computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
277 bool HasSimdModifier, bool HasMonotonicModifier,
278 bool HasNonmonotonicModifier, bool HasOrderedClause,
279 bool HasDistScheduleChunks) {
281 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
282 OMPScheduleType OrderedSchedule =
283 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
285 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
286 HasNonmonotonicModifier, HasOrderedClause);
287
289 return Result;
290}
291
292/// Make \p Source branch to \p Target.
293///
294/// Handles two situations:
295/// * \p Source already has an unconditional branch.
296/// * \p Source is a degenerate block (no terminator because the BB is
297/// the current head of the IR construction).
299 if (Instruction *Term = Source->getTerminator()) {
300 auto *Br = cast<BranchInst>(Term);
301 assert(!Br->isConditional() &&
302 "BB's terminator must be an unconditional branch (or degenerate)");
303 BasicBlock *Succ = Br->getSuccessor(0);
304 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
305 Br->setSuccessor(0, Target);
306 return;
307 }
308
309 auto *NewBr = BranchInst::Create(Target, Source);
310 NewBr->setDebugLoc(DL);
311}
312
314 bool CreateBranch, DebugLoc DL) {
315 assert(New->getFirstInsertionPt() == New->begin() &&
316 "Target BB must not have PHI nodes");
317
318 // Move instructions to new block.
319 BasicBlock *Old = IP.getBlock();
320 // If the `Old` block is empty then there are no instructions to move. But in
321 // the new debug scheme, it could have trailing debug records which will be
322 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
323 // reasons:
324 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
325 // 2. Even if `New` is not empty, the rationale to move those records to `New`
326 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
327 // assumes that `Old` is optimized out and is going away. This is not the case
328 // here. The `Old` block is still being used e.g. a branch instruction is
329 // added to it later in this function.
330 // So we call `BasicBlock::splice` only when `Old` is not empty.
331 if (!Old->empty())
332 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
333
334 if (CreateBranch) {
335 auto *NewBr = BranchInst::Create(New, Old);
336 NewBr->setDebugLoc(DL);
337 }
338}
339
340void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
341 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
342 BasicBlock *Old = Builder.GetInsertBlock();
343
344 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
345 if (CreateBranch)
346 Builder.SetInsertPoint(Old->getTerminator());
347 else
348 Builder.SetInsertPoint(Old);
349
350 // SetInsertPoint also updates the Builder's debug location, but we want to
351 // keep the one the Builder was configured to use.
352 Builder.SetCurrentDebugLocation(DebugLoc);
353}
354
356 DebugLoc DL, llvm::Twine Name) {
357 BasicBlock *Old = IP.getBlock();
359 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
360 Old->getParent(), Old->getNextNode());
361 spliceBB(IP, New, CreateBranch, DL);
362 New->replaceSuccessorsPhiUsesWith(Old, New);
363 return New;
364}
365
366BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
367 llvm::Twine Name) {
368 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
369 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
370 if (CreateBranch)
371 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
372 else
373 Builder.SetInsertPoint(Builder.GetInsertBlock());
374 // SetInsertPoint also updates the Builder's debug location, but we want to
375 // keep the one the Builder was configured to use.
376 Builder.SetCurrentDebugLocation(DebugLoc);
377 return New;
378}
379
380BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
381 llvm::Twine Name) {
382 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
383 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
384 if (CreateBranch)
385 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
386 else
387 Builder.SetInsertPoint(Builder.GetInsertBlock());
388 // SetInsertPoint also updates the Builder's debug location, but we want to
389 // keep the one the Builder was configured to use.
390 Builder.SetCurrentDebugLocation(DebugLoc);
391 return New;
392}
393
395 llvm::Twine Suffix) {
396 BasicBlock *Old = Builder.GetInsertBlock();
397 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
398}
399
400// This function creates a fake integer value and a fake use for the integer
401// value. It returns the fake value created. This is useful in modeling the
402// extra arguments to the outlined functions.
404 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
406 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
407 const Twine &Name = "", bool AsPtr = true,
408 bool Is64Bit = false) {
409 Builder.restoreIP(OuterAllocaIP);
410 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
411 Instruction *FakeVal;
412 AllocaInst *FakeValAddr =
413 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
414 ToBeDeleted.push_back(FakeValAddr);
415
416 if (AsPtr) {
417 FakeVal = FakeValAddr;
418 } else {
419 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
420 ToBeDeleted.push_back(FakeVal);
421 }
422
423 // Generate a fake use of this value
424 Builder.restoreIP(InnerAllocaIP);
425 Instruction *UseFakeVal;
426 if (AsPtr) {
427 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
428 } else {
429 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
430 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
431 }
432 ToBeDeleted.push_back(UseFakeVal);
433 return FakeVal;
434}
435
436//===----------------------------------------------------------------------===//
437// OpenMPIRBuilderConfig
438//===----------------------------------------------------------------------===//
439
440namespace {
/// Values for bit flags for marking which requires clauses have been used.
/// These are OR-ed together into OpenMPIRBuilderConfig::RequiresFlags; the
/// LLVM_MARK_AS_BITMASK_ENUM below enables the bitwise operators on the type.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};
458
459} // anonymous namespace
460
462 : RequiresFlags(OMP_REQ_UNDEFINED) {}
463
466 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
467 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
470 RequiresFlags(OMP_REQ_UNDEFINED) {
471 if (HasRequiresReverseOffload)
472 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
473 if (HasRequiresUnifiedAddress)
474 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
475 if (HasRequiresUnifiedSharedMemory)
476 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
477 if (HasRequiresDynamicAllocators)
478 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
479}
480
482 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
483}
484
486 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
487}
488
490 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
491}
492
494 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
495}
496
498 return hasRequiresFlags() ? RequiresFlags
499 : static_cast<int64_t>(OMP_REQ_NONE);
500}
501
503 if (Value)
504 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
505 else
506 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
507}
508
510 if (Value)
511 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
512 else
513 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
514}
515
517 if (Value)
518 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
519 else
520 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
521}
522
524 if (Value)
525 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
526 else
527 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
528}
529
530//===----------------------------------------------------------------------===//
531// OpenMPIRBuilder
532//===----------------------------------------------------------------------===//
533
536 SmallVector<Value *> &ArgsVector) {
538 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
539 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
540 constexpr size_t MaxDim = 3;
541 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
542
543 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
544
545 Value *DynCGroupMemFallbackFlag =
546 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
547 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
548 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
549
550 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
551
552 Value *NumTeams3D =
553 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
554 Value *NumThreads3D =
555 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
556 for (unsigned I :
557 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
558 NumTeams3D =
559 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
560 for (unsigned I :
561 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
562 NumThreads3D =
563 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
564
565 ArgsVector = {Version,
566 PointerNum,
567 KernelArgs.RTArgs.BasePointersArray,
568 KernelArgs.RTArgs.PointersArray,
569 KernelArgs.RTArgs.SizesArray,
570 KernelArgs.RTArgs.MapTypesArray,
571 KernelArgs.RTArgs.MapNamesArray,
572 KernelArgs.RTArgs.MappersArray,
573 KernelArgs.NumIterations,
574 Flags,
575 NumTeams3D,
576 NumThreads3D,
577 KernelArgs.DynCGroupMem};
578}
579
581 LLVMContext &Ctx = Fn.getContext();
582
583 // Get the function's current attributes.
584 auto Attrs = Fn.getAttributes();
585 auto FnAttrs = Attrs.getFnAttrs();
586 auto RetAttrs = Attrs.getRetAttrs();
588 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
589 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
590
591 // Add AS to FnAS while taking special care with integer extensions.
592 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
593 bool Param = true) -> void {
594 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
595 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
596 if (HasSignExt || HasZeroExt) {
597 assert(AS.getNumAttributes() == 1 &&
598 "Currently not handling extension attr combined with others.");
599 if (Param) {
600 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
601 FnAS = FnAS.addAttribute(Ctx, AK);
602 } else if (auto AK =
603 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
604 FnAS = FnAS.addAttribute(Ctx, AK);
605 } else {
606 FnAS = FnAS.addAttributes(Ctx, AS);
607 }
608 };
609
610#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
611#include "llvm/Frontend/OpenMP/OMPKinds.def"
612
613 // Add attributes to the function declaration.
614 switch (FnID) {
615#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
616 case Enum: \
617 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
618 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
619 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
620 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
621 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
622 break;
623#include "llvm/Frontend/OpenMP/OMPKinds.def"
624 default:
625 // Attributes are optional.
626 break;
627 }
628}
629
632 FunctionType *FnTy = nullptr;
633 Function *Fn = nullptr;
634
635 // Try to find the declation in the module first.
636 switch (FnID) {
637#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
638 case Enum: \
639 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
640 IsVarArg); \
641 Fn = M.getFunction(Str); \
642 break;
643#include "llvm/Frontend/OpenMP/OMPKinds.def"
644 }
645
646 if (!Fn) {
647 // Create a new declaration if we need one.
648 switch (FnID) {
649#define OMP_RTL(Enum, Str, ...) \
650 case Enum: \
651 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
652 break;
653#include "llvm/Frontend/OpenMP/OMPKinds.def"
654 }
655 Fn->setCallingConv(Config.getRuntimeCC());
656 // Add information if the runtime function takes a callback function
657 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
658 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
659 LLVMContext &Ctx = Fn->getContext();
660 MDBuilder MDB(Ctx);
661 // Annotate the callback behavior of the runtime function:
662 // - The callback callee is argument number 2 (microtask).
663 // - The first two arguments of the callback callee are unknown (-1).
664 // - All variadic arguments to the runtime function are passed to the
665 // callback callee.
666 Fn->addMetadata(
667 LLVMContext::MD_callback,
669 2, {-1, -1}, /* VarArgsArePassed */ true)}));
670 }
671 }
672
673 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
674 << " with type " << *Fn->getFunctionType() << "\n");
675 addAttributes(FnID, *Fn);
676
677 } else {
678 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
679 << " with type " << *Fn->getFunctionType() << "\n");
680 }
681
682 assert(Fn && "Failed to create OpenMP runtime function");
683
684 return {FnTy, Fn};
685}
686
689 if (!FiniBB) {
690 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
692 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
693 Builder.SetInsertPoint(FiniBB);
694 // FiniCB adds the branch to the exit stub.
695 if (Error Err = FiniCB(Builder.saveIP()))
696 return Err;
697 }
698 return FiniBB;
699}
700
702 BasicBlock *OtherFiniBB) {
703 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
704 if (!FiniBB) {
705 FiniBB = OtherFiniBB;
706
707 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
708 if (Error Err = FiniCB(Builder.saveIP()))
709 return Err;
710
711 return Error::success();
712 }
713
714 // Move instructions from FiniBB to the start of OtherFiniBB.
715 auto EndIt = FiniBB->end();
716 if (FiniBB->size() >= 1)
717 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
718 EndIt = Prev;
719 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
720 EndIt);
721
722 FiniBB->replaceAllUsesWith(OtherFiniBB);
723 FiniBB->eraseFromParent();
724 FiniBB = OtherFiniBB;
725 return Error::success();
726}
727
730 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
731 assert(Fn && "Failed to create OpenMP runtime function pointer");
732 return Fn;
733}
734
737 StringRef Name) {
738 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
739 Call->setCallingConv(Config.getRuntimeCC());
740 return Call;
741}
742
743void OpenMPIRBuilder::initialize() { initializeTypes(M); }
744
747 BasicBlock &EntryBlock = Function->getEntryBlock();
748 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
749
750 // Loop over blocks looking for constant allocas, skipping the entry block
751 // as any allocas there are already in the desired location.
752 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
753 Block++) {
754 for (auto Inst = Block->getReverseIterator()->begin();
755 Inst != Block->getReverseIterator()->end();) {
757 Inst++;
759 continue;
760 AllocaInst->moveBeforePreserving(MoveLocInst);
761 } else {
762 Inst++;
763 }
764 }
765 }
766}
767
770
771 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
772 // TODO: For now, we support simple static allocations, we might need to
773 // move non-static ones as well. However, this will need further analysis to
  // move the length arguments as well.
776 };
777
778 for (llvm::Instruction &Inst : Block)
780 if (ShouldHoistAlloca(*AllocaInst))
781 AllocasToMove.push_back(AllocaInst);
782
783 auto InsertPoint =
784 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
785
786 for (llvm::Instruction *AllocaInst : AllocasToMove)
788}
789
791 PostDominatorTree PostDomTree(*Func);
792 for (llvm::BasicBlock &BB : *Func)
793 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
795}
796
798 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
800 SmallVector<OutlineInfo, 16> DeferredOutlines;
801 for (OutlineInfo &OI : OutlineInfos) {
802 // Skip functions that have not finalized yet; may happen with nested
803 // function generation.
804 if (Fn && OI.getFunction() != Fn) {
805 DeferredOutlines.push_back(OI);
806 continue;
807 }
808
809 ParallelRegionBlockSet.clear();
810 Blocks.clear();
811 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
812
813 Function *OuterFn = OI.getFunction();
814 CodeExtractorAnalysisCache CEAC(*OuterFn);
815 // If we generate code for the target device, we need to allocate
816 // struct for aggregate params in the device default alloca address space.
817 // OpenMP runtime requires that the params of the extracted functions are
818 // passed as zero address space pointers. This flag ensures that
819 // CodeExtractor generates correct code for extracted functions
820 // which are used by OpenMP runtime.
821 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
822 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
823 /* AggregateArgs */ true,
824 /* BlockFrequencyInfo */ nullptr,
825 /* BranchProbabilityInfo */ nullptr,
826 /* AssumptionCache */ nullptr,
827 /* AllowVarArgs */ true,
828 /* AllowAlloca */ true,
829 /* AllocaBlock*/ OI.OuterAllocaBB,
830 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
831
832 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
833 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
834 << " Exit: " << OI.ExitBB->getName() << "\n");
835 assert(Extractor.isEligible() &&
836 "Expected OpenMP outlining to be possible!");
837
838 for (auto *V : OI.ExcludeArgsFromAggregate)
839 Extractor.excludeArgFromAggregate(V);
840
841 Function *OutlinedFn =
842 Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
843
844 // Forward target-cpu, target-features attributes to the outlined function.
845 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
846 if (TargetCpuAttr.isStringAttribute())
847 OutlinedFn->addFnAttr(TargetCpuAttr);
848
849 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
850 if (TargetFeaturesAttr.isStringAttribute())
851 OutlinedFn->addFnAttr(TargetFeaturesAttr);
852
853 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
854 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
855 assert(OutlinedFn->getReturnType()->isVoidTy() &&
856 "OpenMP outlined functions should not return a value!");
857
  // For compatibility with the clang CG we move the outlined function after the
859 // one with the parallel region.
860 OutlinedFn->removeFromParent();
861 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
862
863 // Remove the artificial entry introduced by the extractor right away, we
864 // made our own entry block after all.
865 {
866 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
867 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
868 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
869 // Move instructions from the to-be-deleted ArtificialEntry to the entry
870 // basic block of the parallel region. CodeExtractor generates
871 // instructions to unwrap the aggregate argument and may sink
872 // allocas/bitcasts for values that are solely used in the outlined region
873 // and do not escape.
874 assert(!ArtificialEntry.empty() &&
875 "Expected instructions to add in the outlined region entry");
876 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
877 End = ArtificialEntry.rend();
878 It != End;) {
879 Instruction &I = *It;
880 It++;
881
882 if (I.isTerminator()) {
883 // Absorb any debug value that terminator may have
884 if (OI.EntryBB->getTerminator())
885 OI.EntryBB->getTerminator()->adoptDbgRecords(
886 &ArtificialEntry, I.getIterator(), false);
887 continue;
888 }
889
890 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
891 }
892
893 OI.EntryBB->moveBefore(&ArtificialEntry);
894 ArtificialEntry.eraseFromParent();
895 }
896 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
897 assert(OutlinedFn && OutlinedFn->hasNUses(1));
898
899 // Run a user callback, e.g. to add attributes.
900 if (OI.PostOutlineCB)
901 OI.PostOutlineCB(*OutlinedFn);
902
903 if (OI.FixUpNonEntryAllocas)
905 }
906
907 // Remove work items that have been completed.
908 OutlineInfos = std::move(DeferredOutlines);
909
910 // The createTarget functions embeds user written code into
911 // the target region which may inject allocas which need to
912 // be moved to the entry block of our target or risk malformed
913 // optimisations by later passes, this is only relevant for
914 // the device pass which appears to be a little more delicate
915 // when it comes to optimisations (however, we do not block on
916 // that here, it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates
918 // have been extracted so we have an end product that will not
919 // be implicitly adversely affected by any raises unless
920 // intentionally appended to the list.
921 // NOTE: This only does so for ConstantData, it could be extended
922 // to ConstantExpr's with further effort, however, they should
923 // largely be folded when they get here. Extending it to runtime
924 // defined/read+writeable allocation sizes would be non-trivial
925 // (need to factor in movement of any stores to variables the
926 // allocation size depends on, as well as the usual loads,
927 // otherwise it'll yield the wrong result after movement) and
928 // likely be more suitable as an LLVM optimisation pass.
931
932 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
933 [](EmitMetadataErrorKind Kind,
934 const TargetRegionEntryInfo &EntryInfo) -> void {
935 errs() << "Error of kind: " << Kind
936 << " when emitting offload entries and metadata during "
937 "OMPIRBuilder finalization \n";
938 };
939
940 if (!OffloadInfoManager.empty())
942
943 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
944 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
945 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
946 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
947 }
948
949 IsFinalized = true;
950}
951
952bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
953
// NOTE(review): doxygen-extraction artifact — the destructor's defining line
// (orig. 954) was dropped. The visible body asserts that finalize() consumed
// every pending outlining work item before the builder is destroyed.
 955 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
 956}
957
// NOTE(review): extraction dropped this function's defining line (orig. 958).
// Visible body: materialize a constant i32 global initialized to `Value`
// under `Name`, and return it.
 959 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
 960 auto *GV =
// weak_odr linkage lets identical flag definitions from multiple modules merge.
 961 new GlobalVariable(M, I32Ty,
 962 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
 963 ConstantInt::get(I32Ty, Value), Name);
// Hidden visibility: the flag is internal plumbing, not an exported symbol.
 964 GV->setVisibility(GlobalValue::HiddenVisibility);
 965
 966 return GV;
 967}
968
// NOTE(review): extraction dropped the defining line (orig. 969) and two
// interior lines: the declaration of `UsedArray` (orig. 974) and the
// assignment that wraps each element in a pointer cast (orig. 977).
// Visible logic: convert `List` into an array of pointer constants and emit
// it as an appending-linkage global in the "llvm.metadata" section — the
// llvm.used / llvm.compiler.used idiom (the caller at orig. 946 passes
// "llvm.compiler.used" as `Name`).
 970 if (List.empty())
 971 return;
 972
 973 // Convert List to what ConstantArray needs.
 975 UsedArray.resize(List.size());
 976 for (unsigned I = 0, E = List.size(); I != E; ++I)
 978 cast<Constant>(&*List[I]), Builder.getPtrTy());
 979
 980 if (UsedArray.empty())
 981 return;
 982 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
 983
// Appending linkage merges same-named arrays from other translation units.
 984 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
 985 ConstantArray::get(ATy, UsedArray), Name);
 986
 987 GV->setSection("llvm.metadata");
 988}
989
// NOTE(review): the lines naming this function (orig. 990-991) were dropped;
// `OMPTgtExecModeFlags Mode) {` below is the tail of its parameter list.
// Visible body: emit a constant i8 global "<KernelName>_exec_mode" holding
// `Mode` — presumably read by the device runtime to pick the kernel's
// execution scheme (TODO confirm consumer against upstream).
 992 OMPTgtExecModeFlags Mode) {
 993 auto *Int8Ty = Builder.getInt8Ty();
// weak_any so duplicate definitions are tolerated; protected visibility keeps
// the symbol resolvable within the module at runtime.
 994 auto *GVMode = new GlobalVariable(
 995 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
 996 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
 997 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
 998 return GVMode;
 999}
1000
// NOTE(review): the defining line (orig. 1001) was dropped; the parameters
// below are the tail of getOrCreateIdent's list. Also missing: orig. 1019
// (middle of the address-space comparison) and orig. 1037 (middle of the
// GlobalVariable constructor argument list).
// Purpose (from visible code): cache and return an `ident_t` struct constant
// for a source-location string + flags, reusing any existing module global
// with an identical initializer before creating a new private one.
1002 uint32_t SrcLocStrSize,
1003 IdentFlag LocFlags,
1004 unsigned Reserve2Flags) {
1005 // Enable "C-mode".
1006 LocFlags |= OMP_IDENT_FLAG_KMPC;
1007
// Cache key packs the flags above bit 31 and the reserved flags below.
1008 Constant *&Ident =
1009 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1010 if (!Ident) {
1011 Constant *I32Null = ConstantInt::getNullValue(Int32);
1012 Constant *IdentData[] = {I32Null,
1013 ConstantInt::get(Int32, uint32_t(LocFlags)),
1014 ConstantInt::get(Int32, Reserve2Flags),
1015 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1016
// Field 4 is the location string; cast it if the struct expects a different
// pointer address space than the string constant carries.
1017 size_t SrcLocStrArgIdx = 4;
1018 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1020 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1021 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1022 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1023 Constant *Initializer =
1024 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1025
1026 // Look for existing encoding of the location + flags, not needed but
1027 // minimizes the difference to the existing solution while we transition.
1028 for (GlobalVariable &GV : M.globals())
1029 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1030 if (GV.getInitializer() == Initializer)
1031 Ident = &GV;
1032
1033 if (!Ident) {
1034 auto *GV = new GlobalVariable(
1035 M, OpenMPIRBuilder::Ident,
1036 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1038 M.getDataLayout().getDefaultGlobalsAddressSpace());
1039 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1040 GV->setAlignment(Align(8));
1041 Ident = GV;
1042 }
1043 }
1044
// Normalize to the canonical ident pointer type for callers.
1045 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1046}
1047
// NOTE(review): defining line (orig. 1048) was dropped. Visible logic:
// intern `LocStr` as a global string constant — reporting its length through
// the out-parameter `SrcLocStrSize` — reusing an existing constant global
// with the same initializer when one exists, else creating a fresh one in
// the default globals address space. Results are memoized in SrcLocStrMap.
1049 uint32_t &SrcLocStrSize) {
1050 SrcLocStrSize = LocStr.size();
1051 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1052 if (!SrcLocStr) {
1053 Constant *Initializer =
1054 ConstantDataArray::getString(M.getContext(), LocStr);
1055
1056 // Look for existing encoding of the location, not needed but minimizes the
1057 // difference to the existing solution while we transition.
1058 for (GlobalVariable &GV : M.globals())
1059 if (GV.isConstant() && GV.hasInitializer() &&
1060 GV.getInitializer() == Initializer)
// Note: assigns into the map entry and returns in one expression.
1061 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1062
1063 SrcLocStr = Builder.CreateGlobalString(
1064 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1065 &M);
1066 }
1067 return SrcLocStr;
1068}
1069
// NOTE(review): defining line (orig. 1070) was dropped; parameters below are
// the tail of the (FunctionName, FileName, Line, Column, Size) overload.
// Builds the kmpc-encoded location string ";file;function;line;column;;"
// and interns it via the StringRef overload above.
1071 StringRef FileName,
1072 unsigned Line, unsigned Column,
1073 uint32_t &SrcLocStrSize) {
1074 SmallString<128> Buffer;
1075 Buffer.push_back(';');
1076 Buffer.append(FileName);
1077 Buffer.push_back(';');
1078 Buffer.append(FunctionName);
1079 Buffer.push_back(';');
1080 Buffer.append(std::to_string(Line));
1081 Buffer.push_back(';');
1082 Buffer.append(std::to_string(Column));
1083 Buffer.push_back(';');
1084 Buffer.push_back(';');
1085 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1086}
1087
// Returns the interned fallback location string used when no debug info is
// available. NOTE(review): the line naming this overload (orig. 1089) was
// dropped by extraction.
1088Constant *
1090 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1091 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1092}
1093
// NOTE(review): defining line (orig. 1094) was dropped; this is the DebugLoc
// overload. Derives file/function/line/column from the DILocation, falling
// back to the default string when there is no debug location, to the module
// name when there is no file source, and to `F`'s name when the subprogram
// name is empty.
1095 uint32_t &SrcLocStrSize,
1096 Function *F) {
1097 DILocation *DIL = DL.get();
1098 if (!DIL)
1099 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1100 StringRef FileName = M.getName();
1101 if (DIFile *DIF = DIL->getFile())
1102 if (std::optional<StringRef> Source = DIF->getSource())
1103 FileName = *Source;
1104 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1105 if (Function.empty() && F)
1106 Function = F->getName();
1107 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1108 DIL->getColumn(), SrcLocStrSize);
1109}
1110
// NOTE(review): defining line (orig. 1111) was dropped; this is the
// LocationDescription convenience overload — forwards the location's
// DebugLoc plus the function owning the insertion point.
1112 uint32_t &SrcLocStrSize) {
1113 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1114 Loc.IP.getBlock()->getParent());
1115}
1116
// NOTE(review): extraction dropped orig. 1117-1118 (the definition and the
// start of the call expression). Visible tail: call
// __kmpc_global_thread_num(Ident) and name the result
// "omp_global_thread_num".
1119 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1120 "omp_global_thread_num");
1121}
1122
// NOTE(review): extraction dropped the defining lines (orig. 1123-1124) and
// the line opening the runtime call (orig. 1163, presumably binding
// `Result`). Visible logic: emit __kmpc_barrier, or __kmpc_cancel_barrier
// when inside a cancellable parallel region, tagging the ident with a
// directive-specific implicit/explicit barrier flag; optionally follow the
// cancel barrier with a cancellation check.
1125 bool ForceSimpleCall, bool CheckCancelFlag) {
1126 if (!updateToLocation(Loc))
1127 return Loc.IP;
1128
1129 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1130 // __kmpc_barrier(loc, thread_id);
1131
// Record which construct this barrier is implicit to (for, sections,
// single), or that it is an explicit `barrier` directive.
1132 IdentFlag BarrierLocFlags;
1133 switch (Kind) {
1134 case OMPD_for:
1135 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1136 break;
1137 case OMPD_sections:
1138 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1139 break;
1140 case OMPD_single:
1141 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1142 break;
1143 case OMPD_barrier:
1144 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1145 break;
1146 default:
1147 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1148 break;
1149 }
1150
1151 uint32_t SrcLocStrSize;
1152 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1153 Value *Args[] = {
1154 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1155 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1156
1157 // If we are in a cancellable parallel region, barriers are cancellation
1158 // points.
1159 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1160 bool UseCancelBarrier =
1161 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1162
1164 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1165 ? OMPRTL___kmpc_cancel_barrier
1166 : OMPRTL___kmpc_barrier),
1167 Args);
1168
1169 if (UseCancelBarrier && CheckCancelFlag)
1170 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1171 return Err;
1172
1173 return Builder.saveIP();
1174}
1175
// NOTE(review): extraction dropped the defining lines (orig. 1176-1177) and
// orig. 1195 (start of the expression bound to `IPOrErr` — presumably a
// cancellation-point emission on the else branch). Visible logic: emit
// __kmpc_cancel guarded by the optional `if` condition, then branch on the
// returned flag via the shared cancellation-check helper.
1178 Value *IfCondition,
1179 omp::Directive CanceledDirective) {
1180 if (!updateToLocation(Loc))
1181 return Loc.IP;
1182
1183 // LLVM utilities like blocks with terminators.
1184 auto *UI = Builder.CreateUnreachable();
1185
1186 Instruction *ThenTI = UI, *ElseTI = nullptr;
1187 if (IfCondition) {
1188 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1189
1190 // Even if the if condition evaluates to false, this should count as a
1191 // cancellation point
1192 Builder.SetInsertPoint(ElseTI);
1193 auto ElseIP = Builder.saveIP();
1194
1196 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1197 if (!IPOrErr)
1198 return IPOrErr;
1199 }
1200
1201 Builder.SetInsertPoint(ThenTI);
1202
// Map the OpenMP directive onto the runtime's numeric cancel kind via the
// table in OMPKinds.def.
1203 Value *CancelKind = nullptr;
1204 switch (CanceledDirective) {
1205#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1206 case DirectiveEnum: \
1207 CancelKind = Builder.getInt32(Value); \
1208 break;
1209#include "llvm/Frontend/OpenMP/OMPKinds.def"
1210 default:
1211 llvm_unreachable("Unknown cancel kind!");
1212 }
1213
1214 uint32_t SrcLocStrSize;
1215 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1216 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1217 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1219 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1220
1221 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1222 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1223 return Err;
1224
1225 // Update the insertion point and remove the terminator we introduced.
1226 Builder.SetInsertPoint(UI->getParent());
1227 UI->eraseFromParent();
1228
1229 return Builder.saveIP();
1230}
1231
// NOTE(review): extraction dropped the defining lines (orig. 1232-1233) and
// orig. 1257 (start of the call expression, presumably binding `Result`).
// Visible logic: mirror of createCancel without the `if` clause — emit
// __kmpc_cancellationpoint and run the shared cancellation check.
1234 omp::Directive CanceledDirective) {
1235 if (!updateToLocation(Loc))
1236 return Loc.IP;
1237
1238 // LLVM utilities like blocks with terminators.
1239 auto *UI = Builder.CreateUnreachable();
1240 Builder.SetInsertPoint(UI);
1241
// Same OMPKinds.def-driven directive-to-cancel-kind mapping as createCancel.
1242 Value *CancelKind = nullptr;
1243 switch (CanceledDirective) {
1244#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1245 case DirectiveEnum: \
1246 CancelKind = Builder.getInt32(Value); \
1247 break;
1248#include "llvm/Frontend/OpenMP/OMPKinds.def"
1249 default:
1250 llvm_unreachable("Unknown cancel kind!");
1251 }
1252
1253 uint32_t SrcLocStrSize;
1254 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1255 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1256 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1258 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1259
1260 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1261 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1262 return Err;
1263
1264 // Update the insertion point and remove the terminator we introduced.
1265 Builder.SetInsertPoint(UI->getParent());
1266 UI->eraseFromParent();
1267
1268 return Builder.saveIP();
1269}
1270
// NOTE(review): extraction dropped the defining line (orig. 1271), orig.
// 1281 (a statement after the kernel_args alloca), and orig. 1294 (start of
// the call expression — presumably the assignment into `Return`). Visible
// logic: allocate a kernel_args struct at AllocaIP, store each launch
// argument into its field with preferred alignment, then call
// __tgt_target_kernel with (ident, device, teams, threads, host ptr, args).
1272 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1273 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1274 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1275 if (!updateToLocation(Loc))
1276 return Loc.IP;
1277
1278 Builder.restoreIP(AllocaIP);
1279 auto *KernelArgsPtr =
1280 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1282
1283 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1284 llvm::Value *Arg =
1285 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1286 Builder.CreateAlignedStore(
1287 KernelArgs[I], Arg,
1288 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1289 }
1290
1291 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1292 NumThreads, HostPtr, KernelArgsPtr};
1293
1295 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1296 OffloadingArgs);
1297
1298 return Builder.saveIP();
1299}
1300
// NOTE(review): the defining line (orig. 1301) was dropped by extraction.
// Visible logic: launch the offloaded region via emitTargetKernel, then
// branch on the non-zero return code into an "omp_offload.failed" block that
// runs the host-fallback callback, rejoining at "omp_offload.cont".
1302 const LocationDescription &Loc, Value *OutlinedFnID,
1303 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1304 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1305
1306 if (!updateToLocation(Loc))
1307 return Loc.IP;
1308
1309 // On top of the arrays that were filled up, the target offloading call
1310 // takes as arguments the device id as well as the host pointer. The host
1311 // pointer is used by the runtime library to identify the current target
1312 // region, so it only has to be unique and not necessarily point to
1313 // anything. It could be the pointer to the outlined function that
1314 // implements the target region, but we aren't using that so that the
1315 // compiler doesn't need to keep that, and could therefore inline the host
1316 // function if proven worthwhile during optimization.
1317
1318 // From this point on, we need to have an ID of the target region defined.
1319 assert(OutlinedFnID && "Invalid outlined function ID!");
1320 (void)OutlinedFnID;
1321
1322 // Return value of the runtime offloading call.
1323 Value *Return = nullptr;
1324
1325 // Arguments for the target kernel.
1326 SmallVector<Value *> ArgsVector;
1327 getKernelArgsVector(Args, Builder, ArgsVector);
1328
1329 // The target region is an outlined function launched by the runtime
1330 // via calls to __tgt_target_kernel().
1331 //
1332 // Note that on the host and CPU targets, the runtime implementation of
1333 // these calls simply call the outlined function without forking threads.
1334 // The outlined functions themselves have runtime calls to
1335 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1336 // the compiler in emitTeamsCall() and emitParallelCall().
1337 //
1338 // In contrast, on the NVPTX target, the implementation of
1339 // __tgt_target_teams() launches a GPU kernel with the requested number
1340 // of teams and threads so no additional calls to the runtime are required.
1341 // Check the error code and execute the host version if required.
1342 Builder.restoreIP(emitTargetKernel(
1343 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1344 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1345
1346 BasicBlock *OffloadFailedBlock =
1347 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1348 BasicBlock *OffloadContBlock =
1349 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
// Non-zero return code from the runtime means the device launch failed.
1350 Value *Failed = Builder.CreateIsNotNull(Return);
1351 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1352
1353 auto CurFn = Builder.GetInsertBlock()->getParent();
1354 emitBlock(OffloadFailedBlock, CurFn);
1355 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1356 if (!AfterIP)
1357 return AfterIP.takeError();
1358 Builder.restoreIP(*AfterIP);
1359 emitBranch(OffloadContBlock);
1360 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1361 return Builder.saveIP();
1362}
1363
// NOTE(review): extraction dropped the defining line (orig. 1364) and orig.
// 1379 (a statement after SplitBlock in the else branch). Visible logic:
// branch on the runtime cancel flag — zero continues in a ".cont" block,
// non-zero jumps to a ".cncl" block that transfers to the finalization block
// supplied by the innermost FinalizationStack entry.
1365 Value *CancelFlag, omp::Directive CanceledDirective) {
1366 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1367 "Unexpected cancellation!");
1368
1369 // For a cancel barrier we create two new blocks.
1370 BasicBlock *BB = Builder.GetInsertBlock();
1371 BasicBlock *NonCancellationBlock;
1372 if (Builder.GetInsertPoint() == BB->end()) {
1373 // TODO: This branch will not be needed once we moved to the
1374 // OpenMPIRBuilder codegen completely.
1375 NonCancellationBlock = BasicBlock::Create(
1376 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1377 } else {
1378 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1380 Builder.SetInsertPoint(BB);
1381 }
1382 BasicBlock *CancellationBlock = BasicBlock::Create(
1383 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1384
1385 // Jump to them based on the return value.
1386 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1387 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1388 /* TODO weight */ nullptr, nullptr);
1389
1390 // From the cancellation block we finalize all variables and go to the
1391 // post finalization block that is known to the FiniCB callback.
1392 auto &FI = FinalizationStack.back();
1393 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1394 if (!FiniBBOrErr)
1395 return FiniBBOrErr.takeError();
1396 Builder.SetInsertPoint(CancellationBlock);
1397 Builder.CreateBr(*FiniBBOrErr);
1398
1399 // The continuation block is where code generation continues.
1400 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1401 return Error::success();
1402}
1403
1404// Callback used to create OpenMP runtime calls to support
1405// omp parallel clause for the device.
1406// We need to use this callback to replace call to the OutlinedFn in OuterFn
1407// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
// NOTE(review): the line declaring this static function (orig. 1408) was
// dropped by extraction; the parameters below are its full list.
1409 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1410 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1411 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1412 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1413 // Add some known attributes.
1414 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1415 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1416 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1417 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1418 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1419 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1420
1421 assert(OutlinedFn.arg_size() >= 2 &&
1422 "Expected at least tid and bounded tid as arguments");
1423 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1424
// The outliner left exactly one call to OutlinedFn; it anchors where the
// runtime call is built and is erased at the end.
1425 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1426 assert(CI && "Expected call instruction to outlined function");
1427 CI->getParent()->setName("omp_parallel");
1428
1429 Builder.SetInsertPoint(CI);
1430 Type *PtrTy = OMPIRBuilder->VoidPtr;
1431 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1432
1433 // Add alloca for kernel args
1434 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1435 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1436 AllocaInst *ArgsAlloca =
1437 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1438 Value *Args = ArgsAlloca;
1439 // Add address space cast if array for storing arguments is not allocated
1440 // in address space 0
1441 if (ArgsAlloca->getAddressSpace())
1442 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1443 Builder.restoreIP(CurrentIP);
1444
1445 // Store captured vars which are used by kmpc_parallel_60
1446 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
// Skip the two leading tid arguments of the modeled call.
1447 Value *V = *(CI->arg_begin() + 2 + Idx);
1448 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1449 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1450 Builder.CreateStore(V, StoreAddress);
1451 }
1452
1453 Value *Cond =
1454 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1455 : Builder.getInt32(1);
1456
1457 // Build kmpc_parallel_60 call
1458 Value *Parallel60CallArgs[] = {
1459 /* identifier*/ Ident,
1460 /* global thread num*/ ThreadID,
1461 /* if expression */ Cond,
1462 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1463 /* Proc bind */ Builder.getInt32(-1),
1464 /* outlined function */ &OutlinedFn,
1465 /* wrapper function */ NullPtrValue,
1466 /* arguments of the outlined function */ Args,
1467 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1468 /* strict for number of threads */ Builder.getInt32(0)};
1469
1470 FunctionCallee RTLFn =
1471 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1472
1473 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1474
1475 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1476 << *Builder.GetInsertBlock()->getParent() << "\n");
1477
1478 // Initialize the local TID stack location with the argument value.
1479 Builder.SetInsertPoint(PrivTID);
1480 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1481 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1482 PrivTIDAddr);
1483
1484 // Remove redundant call to the outlined function.
1485 CI->eraseFromParent();
1486
1487 for (Instruction *I : ToBeDeleted) {
1488 I->eraseFromParent();
1489 }
1490}
1491
1492// Callback used to create OpenMP runtime calls to support
1493// omp parallel clause for the host.
1494// We need to use this callback to replace call to the OutlinedFn in OuterFn
1495// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1496static void
// NOTE(review): extraction dropped orig. 1497 (the line carrying the
// function name and first parameters) and orig. 1520 (middle of the callback
// metadata construction below).
1498 Function *OuterFn, Value *Ident, Value *IfCondition,
1499 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1500 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1501 IRBuilder<> &Builder = OMPIRBuilder->Builder;
// Pick the `_if` variant only when an `if` clause condition must be passed.
1502 FunctionCallee RTLFn;
1503 if (IfCondition) {
1504 RTLFn =
1505 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1506 } else {
1507 RTLFn =
1508 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1509 }
1510 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1511 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1512 LLVMContext &Ctx = F->getContext();
1513 MDBuilder MDB(Ctx);
1514 // Annotate the callback behavior of the __kmpc_fork_call:
1515 // - The callback callee is argument number 2 (microtask).
1516 // - The first two arguments of the callback callee are unknown (-1).
1517 // - All variadic arguments to the __kmpc_fork_call are passed to the
1518 // callback callee.
1519 F->addMetadata(LLVMContext::MD_callback,
1521 2, {-1, -1},
1522 /* VarArgsArePassed */ true)}));
1523 }
1524 }
1525 // Add some known attributes.
1526 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1527 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1528 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1529
1530 assert(OutlinedFn.arg_size() >= 2 &&
1531 "Expected at least tid and bounded tid as arguments");
1532 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1533
// The single modeled call to the outlined function anchors the rewrite and
// is erased once the fork call replaces it.
1534 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1535 CI->getParent()->setName("omp_parallel");
1536 Builder.SetInsertPoint(CI);
1537
1538 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1539 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1540 &OutlinedFn};
1541
1542 SmallVector<Value *, 16> RealArgs;
1543 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1544 if (IfCondition) {
1545 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1546 RealArgs.push_back(Cond);
1547 }
1548 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1549
1550 // __kmpc_fork_call_if always expects a void ptr as the last argument
1551 // If there are no arguments, pass a null pointer.
1552 auto PtrTy = OMPIRBuilder->VoidPtr;
1553 if (IfCondition && NumCapturedVars == 0) {
1554 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1555 RealArgs.push_back(NullPtrValue);
1556 }
1557
1558 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1559
1560 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1561 << *Builder.GetInsertBlock()->getParent() << "\n");
1562
1563 // Initialize the local TID stack location with the argument value.
1564 Builder.SetInsertPoint(PrivTID);
1565 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1566 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1567 PrivTIDAddr);
1568
1569 // Remove redundant call to the outlined function.
1570 CI->eraseFromParent();
1571
1572 for (Instruction *I : ToBeDeleted) {
1573 I->eraseFromParent();
1574 }
1575}
1576
1578 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1579 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1580 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1581 omp::ProcBindKind ProcBind, bool IsCancellable) {
1582 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1583
1584 if (!updateToLocation(Loc))
1585 return Loc.IP;
1586
1587 uint32_t SrcLocStrSize;
1588 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1589 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1590 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1591 (ProcBind != OMP_PROC_BIND_default);
1592 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1593 // If we generate code for the target device, we need to allocate
1594 // struct for aggregate params in the device default alloca address space.
1595 // OpenMP runtime requires that the params of the extracted functions are
1596 // passed as zero address space pointers. This flag ensures that extracted
1597 // function arguments are declared in zero address space
1598 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1599
1600 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1601 // only if we compile for host side.
1602 if (NumThreads && !Config.isTargetDevice()) {
1603 Value *Args[] = {
1604 Ident, ThreadID,
1605 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1607 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1608 }
1609
1610 if (ProcBind != OMP_PROC_BIND_default) {
1611 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1612 Value *Args[] = {
1613 Ident, ThreadID,
1614 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1616 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1617 }
1618
1619 BasicBlock *InsertBB = Builder.GetInsertBlock();
1620 Function *OuterFn = InsertBB->getParent();
1621
1622 // Save the outer alloca block because the insertion iterator may get
1623 // invalidated and we still need this later.
1624 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1625
1626 // Vector to remember instructions we used only during the modeling but which
1627 // we want to delete at the end.
1629
1630 // Change the location to the outer alloca insertion point to create and
1631 // initialize the allocas we pass into the parallel region.
1632 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1633 Builder.restoreIP(NewOuter);
1634 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1635 AllocaInst *ZeroAddrAlloca =
1636 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1637 Instruction *TIDAddr = TIDAddrAlloca;
1638 Instruction *ZeroAddr = ZeroAddrAlloca;
1639 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1640 // Add additional casts to enforce pointers in zero address space
1641 TIDAddr = new AddrSpaceCastInst(
1642 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1643 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1644 ToBeDeleted.push_back(TIDAddr);
1645 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1646 PointerType ::get(M.getContext(), 0),
1647 "zero.addr.ascast");
1648 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1649 ToBeDeleted.push_back(ZeroAddr);
1650 }
1651
1652 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1653 // associated arguments in the outlined function, so we delete them later.
1654 ToBeDeleted.push_back(TIDAddrAlloca);
1655 ToBeDeleted.push_back(ZeroAddrAlloca);
1656
1657 // Create an artificial insertion point that will also ensure the blocks we
1658 // are about to split are not degenerated.
1659 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1660
1661 BasicBlock *EntryBB = UI->getParent();
1662 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1663 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1664 BasicBlock *PRegPreFiniBB =
1665 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1666 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1667
1668 auto FiniCBWrapper = [&](InsertPointTy IP) {
1669 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1670 // target to the region exit block.
1671 if (IP.getBlock()->end() == IP.getPoint()) {
1673 Builder.restoreIP(IP);
1674 Instruction *I = Builder.CreateBr(PRegExitBB);
1675 IP = InsertPointTy(I->getParent(), I->getIterator());
1676 }
1677 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1678 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1679 "Unexpected insertion point for finalization call!");
1680 return FiniCB(IP);
1681 };
1682
1683 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1684
1685 // Generate the privatization allocas in the block that will become the entry
1686 // of the outlined function.
1687 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1688 InsertPointTy InnerAllocaIP = Builder.saveIP();
1689
1690 AllocaInst *PrivTIDAddr =
1691 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1692 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1693
1694 // Add some fake uses for OpenMP provided arguments.
1695 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1696 Instruction *ZeroAddrUse =
1697 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1698 ToBeDeleted.push_back(ZeroAddrUse);
1699
1700 // EntryBB
1701 // |
1702 // V
1703 // PRegionEntryBB <- Privatization allocas are placed here.
1704 // |
1705 // V
1706 // PRegionBodyBB <- BodyGen is invoked here.
1707 // |
1708 // V
1709 // PRegPreFiniBB <- The block we will start finalization from.
1710 // |
1711 // V
1712 // PRegionExitBB <- A common exit to simplify block collection.
1713 //
1714
1715 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1716
1717 // Let the caller create the body.
1718 assert(BodyGenCB && "Expected body generation callback!");
1719 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1720 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1721 return Err;
1722
1723 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1724
1725 OutlineInfo OI;
1726 if (Config.isTargetDevice()) {
1727 // Generate OpenMP target specific runtime call
1728 OI.PostOutlineCB = [=, ToBeDeletedVec =
1729 std::move(ToBeDeleted)](Function &OutlinedFn) {
1730 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1731 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1732 ThreadID, ToBeDeletedVec);
1733 };
1734 OI.FixUpNonEntryAllocas = true;
1735 } else {
1736 // Generate OpenMP host runtime call
1737 OI.PostOutlineCB = [=, ToBeDeletedVec =
1738 std::move(ToBeDeleted)](Function &OutlinedFn) {
1739 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1740 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1741 };
1742 OI.FixUpNonEntryAllocas = true;
1743 }
1744
1745 OI.OuterAllocaBB = OuterAllocaBlock;
1746 OI.EntryBB = PRegEntryBB;
1747 OI.ExitBB = PRegExitBB;
1748
1749 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1751 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1752
1753 CodeExtractorAnalysisCache CEAC(*OuterFn);
1754 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1755 /* AggregateArgs */ false,
1756 /* BlockFrequencyInfo */ nullptr,
1757 /* BranchProbabilityInfo */ nullptr,
1758 /* AssumptionCache */ nullptr,
1759 /* AllowVarArgs */ true,
1760 /* AllowAlloca */ true,
1761 /* AllocationBlock */ OuterAllocaBlock,
1762 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1763
1764 // Find inputs to, outputs from the code region.
1765 BasicBlock *CommonExit = nullptr;
1766 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1767 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1768
1769 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1770 /*CollectGlobalInputs=*/true);
1771
1772 Inputs.remove_if([&](Value *I) {
1774 return GV->getValueType() == OpenMPIRBuilder::Ident;
1775
1776 return false;
1777 });
1778
1779 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1780
1781 FunctionCallee TIDRTLFn =
1782 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1783
1784 auto PrivHelper = [&](Value &V) -> Error {
1785 if (&V == TIDAddr || &V == ZeroAddr) {
1787 return Error::success();
1788 }
1789
1791 for (Use &U : V.uses())
1792 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1793 if (ParallelRegionBlockSet.count(UserI->getParent()))
1794 Uses.insert(&U);
1795
1796 // __kmpc_fork_call expects extra arguments as pointers. If the input
1797 // already has a pointer type, everything is fine. Otherwise, store the
1798 // value onto stack and load it back inside the to-be-outlined region. This
1799 // will ensure only the pointer will be passed to the function.
1800 // FIXME: if there are more than 15 trailing arguments, they must be
1801 // additionally packed in a struct.
1802 Value *Inner = &V;
1803 if (!V.getType()->isPointerTy()) {
1805 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1806
1807 Builder.restoreIP(OuterAllocaIP);
1808 Value *Ptr =
1809 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1810
1811 // Store to stack at end of the block that currently branches to the entry
1812 // block of the to-be-outlined region.
1813 Builder.SetInsertPoint(InsertBB,
1814 InsertBB->getTerminator()->getIterator());
1815 Builder.CreateStore(&V, Ptr);
1816
1817 // Load back next to allocations in the to-be-outlined region.
1818 Builder.restoreIP(InnerAllocaIP);
1819 Inner = Builder.CreateLoad(V.getType(), Ptr);
1820 }
1821
1822 Value *ReplacementValue = nullptr;
1823 CallInst *CI = dyn_cast<CallInst>(&V);
1824 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1825 ReplacementValue = PrivTID;
1826 } else {
1827 InsertPointOrErrorTy AfterIP =
1828 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1829 if (!AfterIP)
1830 return AfterIP.takeError();
1831 Builder.restoreIP(*AfterIP);
1832 InnerAllocaIP = {
1833 InnerAllocaIP.getBlock(),
1834 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1835
1836 assert(ReplacementValue &&
1837 "Expected copy/create callback to set replacement value!");
1838 if (ReplacementValue == &V)
1839 return Error::success();
1840 }
1841
1842 for (Use *UPtr : Uses)
1843 UPtr->set(ReplacementValue);
1844
1845 return Error::success();
1846 };
1847
1848 // Reset the inner alloca insertion as it will be used for loading the values
1849 // wrapped into pointers before passing them into the to-be-outlined region.
1850 // Configure it to insert immediately after the fake use of zero address so
1851 // that they are available in the generated body and so that the
1852 // OpenMP-related values (thread ID and zero address pointers) remain leading
1853 // in the argument list.
1854 InnerAllocaIP = IRBuilder<>::InsertPoint(
1855 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1856
1857 // Reset the outer alloca insertion point to the entry of the relevant block
1858 // in case it was invalidated.
1859 OuterAllocaIP = IRBuilder<>::InsertPoint(
1860 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1861
1862 for (Value *Input : Inputs) {
1863 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1864 if (Error Err = PrivHelper(*Input))
1865 return Err;
1866 }
1867 LLVM_DEBUG({
1868 for (Value *Output : Outputs)
1869 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1870 });
1871 assert(Outputs.empty() &&
1872 "OpenMP outlining should not produce live-out values!");
1873
1874 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1875 LLVM_DEBUG({
1876 for (auto *BB : Blocks)
1877 dbgs() << " PBR: " << BB->getName() << "\n";
1878 });
1879
1880 // Adjust the finalization stack, verify the adjustment, and call the
1881 // finalize function a last time to finalize values between the pre-fini
1882 // block and the exit block if we left the parallel "the normal way".
1883 auto FiniInfo = FinalizationStack.pop_back_val();
1884 (void)FiniInfo;
1885 assert(FiniInfo.DK == OMPD_parallel &&
1886 "Unexpected finalization stack state!");
1887
1888 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1889
1890 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1891 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1892 if (!FiniBBOrErr)
1893 return FiniBBOrErr.takeError();
1894 {
1896 Builder.restoreIP(PreFiniIP);
1897 Builder.CreateBr(*FiniBBOrErr);
1898 // There's currently a branch to omp.par.exit. Delete it. We will get there
1899 // via the fini block
1900 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1901 Term->eraseFromParent();
1902 }
1903
1904 // Register the outlined info.
1905 addOutlineInfo(std::move(OI));
1906
1907 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1908 UI->eraseFromParent();
1909
1910 return AfterIP;
1911}
1912
// NOTE(review): doxygen-extracted fragment. The enclosing function header
// (orig. line 1913 — presumably OpenMPIRBuilder::emitFlush) and the runtime
// call expression on orig. line 1919 were lost in extraction; only the
// trailing "Args);" of that call remains below. Confirm against the
// original source file.
// Purpose (per the comment on orig. line 1914): emit a call to the OpenMP
// runtime entry point __kmpc_flush(ident_t *loc), passing only the source
// location identifier built from Loc.
1914 // Build call void __kmpc_flush(ident_t *loc)
1915 uint32_t SrcLocStrSize;
1916 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1917 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1918
1920 Args);
1921}
1922
// NOTE(review): doxygen-extracted fragment — the function header (orig. line
// 1923, presumably OpenMPIRBuilder::createFlush) was lost in extraction.
// Behavior visible here: bail out when the builder cannot be positioned at
// the given source location, otherwise delegate to emitFlush(Loc) to emit
// the __kmpc_flush runtime call.
1924 if (!updateToLocation(Loc))
1925 return;
1926 emitFlush(Loc);
1927}
1928
// NOTE(review): doxygen-extracted fragment — the function header (orig. line
// 1929, presumably OpenMPIRBuilder::emitTaskwaitImpl) and the call expression
// on orig. line 1938 were lost in extraction; only its argument list remains.
// Builds the (ident, global_tid) argument pair and calls the runtime's
// __kmpc_omp_taskwait; the kmp_int32 result is deliberately discarded (see
// the comment on orig. line 1937).
1930 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1931 // global_tid);
1932 uint32_t SrcLocStrSize;
1933 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1934 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1935 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1936
1937 // Ignore return result until untied tasks are supported.
1939 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1940}
1941
1947
// NOTE(review): doxygen-extracted fragment — the function header (orig. line
// 1948, presumably OpenMPIRBuilder::emitTaskyieldImpl) and the call
// expression on orig. line 1956 were lost in extraction.
// Emits __kmpc_omp_taskyield(loc, thread_id, 0); the trailing i32 zero
// (I32Null below) is the third argument shown in the comment on orig. line
// 1949.
1949 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1950 uint32_t SrcLocStrSize;
1951 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1952 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1953 Constant *I32Null = ConstantInt::getNullValue(Int32);
1954 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1955
1957 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1958}
1959
1965
1966// Processes the dependencies in Dependencies and does the following
1967// - Allocates space on the stack of an array of DependInfo objects
1968// - Populates each DependInfo object with relevant information of
1969// the corresponding dependence.
1970// - All code is inserted in the entry block of the current function.
// NOTE(review): the function signature line (orig. line 1971, presumably
// `static Value *emitTaskDependencies(`) and the second parameter line
// (orig. line 1973, the Dependencies container) were lost in extraction.
// Returns nullptr when there are no dependencies, otherwise the alloca
// holding the kmp_depend_info array.
 1972 OpenMPIRBuilder &OMPBuilder,
 1974 // Early return if we have no dependencies to process
 1975 if (Dependencies.empty())
 1976 return nullptr;
 1977
 1978 // Given a vector of DependData objects, in this function we create an
 1979 // array on the stack that holds kmp_dep_info objects corresponding
 1980 // to each dependency. This is then passed to the OpenMP runtime.
 1981 // For example, if there are 'n' dependencies then the following psedo
 1982 // code is generated. Assume the first dependence is on a variable 'a'
 1983 //
 1984 // \code{c}
 1985 // DepArray = alloc(n x sizeof(kmp_depend_info);
 1986 // idx = 0;
 1987 // DepArray[idx].base_addr = ptrtoint(&a);
 1988 // DepArray[idx].len = 8;
 1989 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
 1990 // ++idx;
 1991 // DepArray[idx].base_addr = ...;
 1992 // \endcode
 1993
 1994 IRBuilderBase &Builder = OMPBuilder.Builder;
 1995 Type *DependInfo = OMPBuilder.DependInfo;
 1996 Module &M = OMPBuilder.M;
 1997
 1998 Value *DepArray = nullptr;
 1999 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
// NOTE(review): the argument of SetInsertPoint (orig. line 2001) was lost in
// extraction — per the header comment it positions the builder in the entry
// block of the current function so the array alloca lands there.
 2000 Builder.SetInsertPoint(
 2002
 2003 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
 2004 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
 2005
 2006 Builder.restoreIP(OldIP);
 2007
 2008 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
 2009 Value *Base =
 2010 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
 2011 // Store the pointer to the variable
 2012 Value *Addr = Builder.CreateStructGEP(
 2013 DependInfo, Base,
 2014 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
 2015 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
 2016 Builder.CreateStore(DepValPtr, Addr);
 2017 // Store the size of the variable
 2018 Value *Size = Builder.CreateStructGEP(
 2019 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
 2020 Builder.CreateStore(
 2021 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
 2022 Size);
 2023 // Store the dependency kind
 2024 Value *Flags = Builder.CreateStructGEP(
 2025 DependInfo, Base,
 2026 static_cast<unsigned int>(RTLDependInfoFields::Flags));
 2027 Builder.CreateStore(
 2028 ConstantInt::get(Builder.getInt8Ty(),
 2029 static_cast<unsigned int>(Dep.DepKind)),
 2030 Flags);
 2031 }
 2032 return DepArray;
 2033}
2034
2035/// Create the task duplication function passed to kmpc_taskloop.
///
/// \param PrivatesTy    Struct type describing the task's privates block.
/// \param PrivatesIndex Field index inside \p PrivatesTy of the context
///                      pointer handed to \p DupCB.
/// \param DupCB         User callback that emits the per-task copy code;
///                      when null, no function body is generated.
/// \returns the generated internal-linkage "omp_taskloop_dup" function, a
///          null pointer value when \p DupCB is absent (see NOTE below), or
///          the error propagated from \p DupCB.
2036Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2037 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2038 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
// NOTE(review): the expression returned when DupCB is null (orig. line 2040)
// was lost in extraction; only its argument — a program-address-space pointer
// type — remains on the next line. Presumably a null pointer constant of
// that type is returned; confirm against the original source.
2039 if (!DupCB)
2041 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2042
2043 // From OpenMP Runtime p_task_dup_t:
2044 // Routine optionally generated by the compiler for setting the lastprivate
2045 // flag and calling needed constructors for private/firstprivate objects (used
2046 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2047 // lastprivate flag.
2048 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2049
2050 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2051
2052 FunctionType *DupFuncTy = FunctionType::get(
2053 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2054 /*isVarArg=*/false);
2055
2056 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2057 "omp_taskloop_dup", M);
2058 Value *DestTaskArg = DupFunction->getArg(0);
2059 Value *SrcTaskArg = DupFunction->getArg(1);
2060 Value *LastprivateFlagArg = DupFunction->getArg(2);
2061 DestTaskArg->setName("dest_task");
2062 SrcTaskArg->setName("src_task");
2063 LastprivateFlagArg->setName("lastprivate_flag");
2064
// InsertPointGuard restores the caller's insertion point when this function
// returns, so emitting into the new entry block does not disturb the caller.
2065 IRBuilderBase::InsertPointGuard Guard(Builder);
2066 Builder.SetInsertPoint(
2067 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2068
// Compute, for a kmp_task_t* argument, the address of the context field:
// GEP into { Task, PrivatesTy } to reach the privates block, then into
// PrivatesTy at PrivatesIndex.
2069 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2070 Type *TaskWithPrivatesTy =
2071 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2072 Value *TaskPrivates = Builder.CreateGEP(
2073 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2074 Value *ContextPtr = Builder.CreateGEP(
2075 PrivatesTy, TaskPrivates,
2076 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2077 return ContextPtr;
2078 };
2079
2080 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2081 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2082
2083 DestTaskContextPtr->setName("destPtr");
2084 SrcTaskContextPtr->setName("srcPtr");
2085
// Let the user callback fill in the body; it receives an alloca insertion
// point at the top of the entry block and the current codegen point.
2086 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2087 DupFunction->getEntryBlock().begin());
2088 InsertPointTy CodeGenIP = Builder.saveIP();
2089 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2090 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2091 if (!AfterIPOrError)
2092 return AfterIPOrError.takeError();
2093 Builder.restoreIP(*AfterIPOrError);
2094
2095 Builder.CreateRetVoid();
2096
2097 return DupFunction;
2098}
2099
/// Emit an OpenMP taskloop construct.
///
/// Visible flow: splits the current block into taskloop.exit / taskloop.body
/// / taskloop.alloca, runs \p BodyGenCB to fill in the body, obtains the
/// canonical loop via \p LoopInfo, then registers an OutlineInfo whose
/// post-outline callback replaces the stale outlined call with
/// __kmpc_omp_task_alloc + __kmpc_taskloop (bracketed by
/// __kmpc_taskgroup/__kmpc_end_taskgroup unless \p NoGroup) and rewires the
/// outlined loop to read its bounds from the task's shareds struct.
/// Fake lb/ub/step values are forced to be the first three inputs of the
/// shareds aggregate and later replaced by casts of \p LBVal/\p UBVal/
/// \p StepVal. Returns the insert point after the taskloop, or any error
/// from the callbacks.
2100OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2101 const LocationDescription &Loc, InsertPointTy AllocaIP,
2102 BodyGenCallbackTy BodyGenCB,
2103 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2104 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2105 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2106 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2107 Value *TaskContextStructPtrVal) {
2108
2109 if (!updateToLocation(Loc))
2110 return InsertPointTy();
2111
2112 uint32_t SrcLocStrSize;
2113 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2114 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2115
// Split off the region that will be outlined into the task entry function.
2116 BasicBlock *TaskloopExitBB =
2117 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2118 BasicBlock *TaskloopBodyBB =
2119 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2120 BasicBlock *TaskloopAllocaBB =
2121 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2122
2123 InsertPointTy TaskloopAllocaIP =
2124 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2125 InsertPointTy TaskloopBodyIP =
2126 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2127
2128 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2129 return Err;
2130
2131 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2132 if (!result) {
2133 return result.takeError();
2134 }
2135
2136 llvm::CanonicalLoopInfo *CLI = result.get();
2137 OutlineInfo OI;
2138 OI.EntryBB = TaskloopAllocaBB;
2139 OI.OuterAllocaBB = AllocaIP.getBlock();
2140 OI.ExitBB = TaskloopExitBB;
2141
2142 // Add the thread ID argument.
2143 SmallVector<Instruction *> ToBeDeleted;
2144 // dummy instruction to be used as a fake argument
2145 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2146 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
// Placeholder bounds; replaced by casts of the real LB/UB/Step at the end of
// the post-outline callback, then the fakes are erased via ToBeDeleted.
2147 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2148 TaskloopAllocaIP, "lb", false, true);
2149 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2150 TaskloopAllocaIP, "ub", false, true);
2151 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2152 TaskloopAllocaIP, "step", false, true);
2153 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2154 // aggregate struct
2155 OI.Inputs.insert(FakeLB);
2156 OI.Inputs.insert(FakeUB);
2157 OI.Inputs.insert(FakeStep);
2158 if (TaskContextStructPtrVal)
2159 OI.Inputs.insert(TaskContextStructPtrVal);
2160 assert(((TaskContextStructPtrVal && DupCB) ||
2161 (!TaskContextStructPtrVal && !DupCB)) &&
2162 "Task context struct ptr and duplication callback must be both set "
2163 "or both null");
2164
2165 // It isn't safe to run the duplication bodygen callback inside the post
2166 // outlining callback so this has to be run now before we know the real task
2167 // shareds structure type.
2168 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2169 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2170 Type *FakeSharedsTy = StructType::get(
2171 Builder.getContext(),
2172 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2173 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2174 FakeSharedsTy,
2175 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2176 if (!TaskDupFnOrErr) {
2177 return TaskDupFnOrErr.takeError();
2178 }
2179 Value *TaskDupFn = *TaskDupFnOrErr;
2180
// Runs after the CodeExtractor has produced OutlinedFn; replaces the single
// stale call to it with the task-alloc/taskloop runtime sequence.
2181 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2182 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2183 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2184 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2185 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2186 // Replace the Stale CI by appropriate RTL function call.
2187 assert(OutlinedFn.hasOneUse() &&
2188 "there must be a single user for the outlined function");
2189 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2190
2191 /* Create the casting for the Bounds Values that can be used when outlining
2192 * to replace the uses of the fakes with real values */
2193 BasicBlock *CodeReplBB = StaleCI->getParent();
2194 IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
2195 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2196 Value *CastedLBVal =
2197 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2198 Value *CastedUBVal =
2199 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2200 Value *CastedStepVal =
2201 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2202 Builder.restoreIP(CurrentIp);
2203
2204 Builder.SetInsertPoint(StaleCI);
2205
2206 // Gather the arguments for emitting the runtime call for
2207 // @__kmpc_omp_task_alloc
2208 Function *TaskAllocFn =
2209 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2210
2211 Value *ThreadID = getOrCreateThreadID(Ident);
2212
2213 if (!NoGroup) {
2214 // Emit runtime call for @__kmpc_taskgroup
2215 Function *TaskgroupFn =
2216 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2217 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2218 }
2219
2220 // `flags` Argument Configuration
2221 // Task is tied if (Flags & 1) == 1.
2222 // Task is untied if (Flags & 1) == 0.
2223 // Task is final if (Flags & 2) == 2.
2224 // Task is not final if (Flags & 2) == 0.
2225 // Task is mergeable if (Flags & 4) == 4.
2226 // Task is not mergeable if (Flags & 4) == 0.
2227 // Task is priority if (Flags & 32) == 32.
2228 // Task is not priority if (Flags & 32) == 0.
2229 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2230 if (Final)
2231 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2232 if (Mergeable)
2233 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2234 if (Priority)
2235 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2236
2237 Value *TaskSize = Builder.getInt64(
2238 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2239
// NOTE(review): the initializer of ArgStructAlloca (orig. line 2241) was
// lost in extraction; per the assert text it locates the alloca holding the
// aggregated arguments of the extracted function. Confirm against the
// original source.
2240 AllocaInst *ArgStructAlloca =
2242 assert(ArgStructAlloca &&
2243 "Unable to find the alloca instruction corresponding to arguments "
2244 "for extracted function");
2245 std::optional<TypeSize> ArgAllocSize =
2246 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2247 assert(ArgAllocSize &&
2248 "Unable to determine size of arguments for extracted function");
2249 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2250
2251 // Emit the @__kmpc_omp_task_alloc runtime call
2252 // The runtime call returns a pointer to an area where the task captured
2253 // variables must be copied before the task is run (TaskData)
2254 CallInst *TaskData = Builder.CreateCall(
2255 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2256 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2257 /*task_func=*/&OutlinedFn});
2258
// Copy the captured variables into the task's shareds area.
2259 Value *Shareds = StaleCI->getArgOperand(1);
2260 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2261 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2262 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2263 SharedsSize);
2264 // Get the pointer to loop lb, ub, step from task ptr
2265 // and set up the lowerbound,upperbound and step values
2266 llvm::Value *Lb = Builder.CreateGEP(
2267 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2268
2269 llvm::Value *Ub = Builder.CreateGEP(
2270 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2271
2272 llvm::Value *Step = Builder.CreateGEP(
2273 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2274 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2275
2276 // set up the arguments for emitting kmpc_taskloop runtime call
2277 // setting values for ifval, nogroup, sched, grainsize, task_dup
2278 Value *IfCondVal =
2279 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2280 : Builder.getInt32(1);
2281 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2282 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2283 Value *NoGroupVal = Builder.getInt32(1);
2284 Value *SchedVal = Builder.getInt32(Sched);
2285 Value *GrainSizeVal =
2286 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2287 : Builder.getInt64(0);
2288 Value *TaskDup = TaskDupFn;
2289
2290 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2291 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2292
2293 // taskloop runtime call
2294 Function *TaskloopFn =
2295 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2296 Builder.CreateCall(TaskloopFn, Args);
2297
2298 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2299 // nogroup is not defined
2300 if (!NoGroup) {
2301 Function *EndTaskgroupFn =
2302 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2303 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2304 }
2305
2306 StaleCI->eraseFromParent();
2307
2308 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2309
// Inside the outlined function, arg 1 is a pointer to the shareds pointer;
// load it once and redirect all other uses of the argument to that load.
2310 LoadInst *SharedsOutlined =
2311 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2312 OutlinedFn.getArg(1)->replaceUsesWithIf(
2313 SharedsOutlined,
2314 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2315
2316 Value *IV = CLI->getIndVar();
2317 Type *IVTy = IV->getType();
2318 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2319
2320 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2321 // UpperBound. These GEP's can be reused for loading the tasks respective
2322 // bounds.
2323 Value *TaskLB = nullptr;
2324 Value *TaskUB = nullptr;
2325 Value *LoadTaskLB = nullptr;
2326 Value *LoadTaskUB = nullptr;
2327 for (Instruction &I : *TaskloopAllocaBB) {
2328 if (I.getOpcode() == Instruction::GetElementPtr) {
2329 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2330 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2331 switch (CI->getZExtValue()) {
2332 case 0:
2333 TaskLB = &I;
2334 break;
2335 case 1:
2336 TaskUB = &I;
2337 break;
2338 }
2339 }
2340 } else if (I.getOpcode() == Instruction::Load) {
2341 LoadInst &Load = cast<LoadInst>(I);
2342 if (Load.getPointerOperand() == TaskLB) {
2343 assert(TaskLB != nullptr && "Expected value for TaskLB");
2344 LoadTaskLB = &I;
2345 } else if (Load.getPointerOperand() == TaskUB) {
2346 assert(TaskUB != nullptr && "Expected value for TaskUB");
2347 LoadTaskUB = &I;
2348 }
2349 }
2350 }
2351
2352 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2353
// Recompute the trip count from the task-local bounds:
// (ub - lb) / step + 1, then install it on the canonical loop.
2354 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2355 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2356 Value *TripCountMinusOne =
2357 Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
2358 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2359 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2360 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2361 // set the trip count in the CLI
2362 CLI->setTripCount(CastedTripCount);
2363
2364 Builder.SetInsertPoint(CLI->getBody(),
2365 CLI->getBody()->getFirstInsertionPt());
2366
2367 if (NumOfCollapseLoops > 1) {
2368 llvm::SmallVector<User *> UsersToReplace;
2369 // When using the collapse clause, the bounds of the loop have to be
2370 // adjusted to properly represent the iterator of the outer loop.
2371 Value *IVPlusTaskLB = Builder.CreateAdd(
2372 CLI->getIndVar(),
2373 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2374 // To ensure every Use is correctly captured, we first want to record
2375 // which users to replace the value in, and then replace the value.
2376 for (auto IVUse = CLI->getIndVar()->uses().begin();
2377 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2378 User *IVUser = IVUse->getUser();
2379 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2380 if (Op->getOpcode() == Instruction::URem ||
2381 Op->getOpcode() == Instruction::UDiv) {
2382 UsersToReplace.push_back(IVUser);
2383 }
2384 }
2385 }
2386 for (User *User : UsersToReplace) {
2387 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2388 }
2389 } else {
2390 // The canonical loop is generated with a fixed lower bound. We need to
2391 // update the index calculation code to use the task's lower bound. The
2392 // generated code looks like this:
2393 // %omp_loop.iv = phi ...
2394 // ...
2395 // %tmp = mul [type] %omp_loop.iv, step
2396 // %user_index = add [type] tmp, lb
2397 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2398 // of the normalised induction variable:
2399 // 1. This one: converting the normalised IV to the user IV
2400 // 2. The increment (add)
2401 // 3. The comparison against the trip count (icmp)
2402 // (1) is the only use that is a mul followed by an add so this cannot
2403 // match other IR.
2404 assert(CLI->getIndVar()->getNumUses() == 3 &&
2405 "Canonical loop should have exactly three uses of the ind var");
2406 for (User *IVUser : CLI->getIndVar()->users()) {
2407 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2408 if (Mul->getOpcode() == Instruction::Mul) {
2409 for (User *MulUser : Mul->users()) {
2410 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2411 if (Add->getOpcode() == Instruction::Add) {
2412 Add->setOperand(1, CastedTaskLB);
2413 }
2414 }
2415 }
2416 }
2417 }
2418 }
2419 }
2420
// Swap the placeholder bounds for the real (casted) values, then erase the
// fake instructions in reverse creation order.
2421 FakeLB->replaceAllUsesWith(CastedLBVal);
2422 FakeUB->replaceAllUsesWith(CastedUBVal);
2423 FakeStep->replaceAllUsesWith(CastedStepVal);
2424 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2425 I->eraseFromParent();
2426 }
2427 };
2428
2429 addOutlineInfo(std::move(OI));
2430 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2431 return Builder.saveIP();
2432}
2433
2435 const LocationDescription &Loc, InsertPointTy AllocaIP,
2436 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2437 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
2438 Value *Priority) {
2439
2440 if (!updateToLocation(Loc))
2441 return InsertPointTy();
2442
2443 uint32_t SrcLocStrSize;
2444 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2445 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2446 // The current basic block is split into four basic blocks. After outlining,
2447 // they will be mapped as follows:
2448 // ```
2449 // def current_fn() {
2450 // current_basic_block:
2451 // br label %task.exit
2452 // task.exit:
2453 // ; instructions after task
2454 // }
2455 // def outlined_fn() {
2456 // task.alloca:
2457 // br label %task.body
2458 // task.body:
2459 // ret void
2460 // }
2461 // ```
2462 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2463 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2464 BasicBlock *TaskAllocaBB =
2465 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2466
2467 InsertPointTy TaskAllocaIP =
2468 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2469 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2470 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2471 return Err;
2472
2473 OutlineInfo OI;
2474 OI.EntryBB = TaskAllocaBB;
2475 OI.OuterAllocaBB = AllocaIP.getBlock();
2476 OI.ExitBB = TaskExitBB;
2477
2478 // Add the thread ID argument.
2481 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2482
2483 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2484 Mergeable, Priority, EventHandle, TaskAllocaBB,
2485 ToBeDeleted](Function &OutlinedFn) mutable {
2486 // Replace the Stale CI by appropriate RTL function call.
2487 assert(OutlinedFn.hasOneUse() &&
2488 "there must be a single user for the outlined function");
2489 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2490
2491 // HasShareds is true if any variables are captured in the outlined region,
2492 // false otherwise.
2493 bool HasShareds = StaleCI->arg_size() > 1;
2494 Builder.SetInsertPoint(StaleCI);
2495
2496 // Gather the arguments for emitting the runtime call for
2497 // @__kmpc_omp_task_alloc
2498 Function *TaskAllocFn =
2499 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2500
2501 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2502 // call.
2503 Value *ThreadID = getOrCreateThreadID(Ident);
2504
2505 // Argument - `flags`
2506 // Task is tied iff (Flags & 1) == 1.
2507 // Task is untied iff (Flags & 1) == 0.
2508 // Task is final iff (Flags & 2) == 2.
2509 // Task is not final iff (Flags & 2) == 0.
2510 // Task is mergeable iff (Flags & 4) == 4.
2511 // Task is not mergeable iff (Flags & 4) == 0.
2512 // Task is priority iff (Flags & 32) == 32.
2513 // Task is not priority iff (Flags & 32) == 0.
2514 // TODO: Handle the other flags.
2515 Value *Flags = Builder.getInt32(Tied);
2516 if (Final) {
2517 Value *FinalFlag =
2518 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2519 Flags = Builder.CreateOr(FinalFlag, Flags);
2520 }
2521
2522 if (Mergeable)
2523 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2524 if (Priority)
2525 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2526
2527 // Argument - `sizeof_kmp_task_t` (TaskSize)
2528 // Tasksize refers to the size in bytes of kmp_task_t data structure
2529 // including private vars accessed in task.
2530 // TODO: add kmp_task_t_with_privates (privates)
2531 Value *TaskSize = Builder.getInt64(
2532 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2533
2534 // Argument - `sizeof_shareds` (SharedsSize)
2535 // SharedsSize refers to the shareds array size in the kmp_task_t data
2536 // structure.
2537 Value *SharedsSize = Builder.getInt64(0);
2538 if (HasShareds) {
2539 AllocaInst *ArgStructAlloca =
2541 assert(ArgStructAlloca &&
2542 "Unable to find the alloca instruction corresponding to arguments "
2543 "for extracted function");
2544 std::optional<TypeSize> ArgAllocSize =
2545 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2546 assert(ArgAllocSize &&
2547 "Unable to determine size of arguments for extracted function");
2548 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2549 }
2550 // Emit the @__kmpc_omp_task_alloc runtime call
2551 // The runtime call returns a pointer to an area where the task captured
2552 // variables must be copied before the task is run (TaskData)
2554 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2555 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2556 /*task_func=*/&OutlinedFn});
2557
2558 // Emit detach clause initialization.
2559 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2560 // task_descriptor);
2561 if (EventHandle) {
2563 OMPRTL___kmpc_task_allow_completion_event);
2564 llvm::Value *EventVal =
2565 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2566 llvm::Value *EventHandleAddr =
2567 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2568 Builder.getPtrTy(0));
2569 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2570 Builder.CreateStore(EventVal, EventHandleAddr);
2571 }
2572 // Copy the arguments for outlined function
2573 if (HasShareds) {
2574 Value *Shareds = StaleCI->getArgOperand(1);
2575 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2576 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2577 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2578 SharedsSize);
2579 }
2580
2581 if (Priority) {
2582 //
2583 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2584 // we populate the priority information into the "kmp_task_t" here
2585 //
2586 // The struct "kmp_task_t" definition is available in kmp.h
2587 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2588 // data2 is used for priority
2589 //
2590 Type *Int32Ty = Builder.getInt32Ty();
2591 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2592 // kmp_task_t* => { ptr }
2593 Type *TaskPtr = StructType::get(VoidPtr);
2594 Value *TaskGEP =
2595 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2596 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2597 Type *TaskStructType = StructType::get(
2598 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2599 Value *PriorityData = Builder.CreateInBoundsGEP(
2600 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2601 // kmp_cmplrdata_t => { ptr, ptr }
2602 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2603 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2604 PriorityData, {Zero, Zero});
2605 Builder.CreateStore(Priority, CmplrData);
2606 }
2607
2608 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2609
2610 // In the presence of the `if` clause, the following IR is generated:
2611 // ...
2612 // %data = call @__kmpc_omp_task_alloc(...)
2613 // br i1 %if_condition, label %then, label %else
2614 // then:
2615 // call @__kmpc_omp_task(...)
2616 // br label %exit
2617 // else:
2618 // ;; Wait for resolution of dependencies, if any, before
2619 // ;; beginning the task
2620 // call @__kmpc_omp_wait_deps(...)
2621 // call @__kmpc_omp_task_begin_if0(...)
2622 // call @outlined_fn(...)
2623 // call @__kmpc_omp_task_complete_if0(...)
2624 // br label %exit
2625 // exit:
2626 // ...
2627 if (IfCondition) {
2628 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2629 // terminator.
2630 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2631 Instruction *IfTerminator =
2632 Builder.GetInsertPoint()->getParent()->getTerminator();
2633 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2634 Builder.SetInsertPoint(IfTerminator);
2635 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2636 &ElseTI);
2637 Builder.SetInsertPoint(ElseTI);
2638
2639 if (Dependencies.size()) {
2640 Function *TaskWaitFn =
2641 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2643 TaskWaitFn,
2644 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2645 ConstantInt::get(Builder.getInt32Ty(), 0),
2647 }
2648 Function *TaskBeginFn =
2649 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2650 Function *TaskCompleteFn =
2651 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2652 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2653 CallInst *CI = nullptr;
2654 if (HasShareds)
2655 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2656 else
2657 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2658 CI->setDebugLoc(StaleCI->getDebugLoc());
2659 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2660 Builder.SetInsertPoint(ThenTI);
2661 }
2662
2663 if (Dependencies.size()) {
2664 Function *TaskFn =
2665 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2667 TaskFn,
2668 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2669 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2671
2672 } else {
2673 // Emit the @__kmpc_omp_task runtime call to spawn the task
2674 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2675 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2676 }
2677
2678 StaleCI->eraseFromParent();
2679
2680 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2681 if (HasShareds) {
2682 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2683 OutlinedFn.getArg(1)->replaceUsesWithIf(
2684 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2685 }
2686
2687 for (Instruction *I : llvm::reverse(ToBeDeleted))
2688 I->eraseFromParent();
2689 };
2690
2691 addOutlineInfo(std::move(OI));
2692 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2693
2694 return Builder.saveIP();
2695}
2696
                                 InsertPointTy AllocaIP,
                                 BodyGenCallbackTy BodyGenCB) {
  // Emits a taskgroup region:
  //   __kmpc_taskgroup(loc, tid);
  //   <body generated by BodyGenCB>
  //   __kmpc_end_taskgroup(loc, tid);
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);

  // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
  Function *TaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
  createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});

  // Split off the exit block up front so the body is generated between the
  // two runtime calls; the end call is then emitted in the exit block.
  BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
  if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
    return Err;

  Builder.SetInsertPoint(TaskgroupExitBB);
  // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
  Function *EndTaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
  createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});

  return Builder.saveIP();
}
2726
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Push the finalization for the sections construct so nested constructs can
  // find it; it is popped again after the workshare loop is lowered below.
  FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});

  // Each section is emitted as a switch case
  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
  // -> OMP.createSection() which generates the IR for each section
  // Iterate through all sections and emit a switch construct:
  // switch (IV) {
  // case 0:
  //   <SectionStmt[0]>;
  //   break;
  // ...
  // case <NumSection> - 1:
  //   <SectionStmt[<NumSection> - 1]>;
  //   break;
  // }
  // ...
  // section_loop.after:
  // <FiniCB>;
  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
    Builder.restoreIP(CodeGenIP);
        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
    Function *CurFn = Continue->getParent();
    // Dispatch on the loop induction variable; the default destination is the
    // fall-through block after all the cases.
    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);

    unsigned CaseNumber = 0;
    for (auto SectionCB : SectionCBs) {
          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
      SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
      Builder.SetInsertPoint(CaseBB);
      // Terminate the case block first; the section body is then generated in
      // front of this branch.
      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
      if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
                                                  CaseEndBr->getIterator()}))
        return Err;
      CaseNumber++;
    }
    // remove the existing terminator from body BB since there can be no
    // terminators after switch/case
    return Error::success();
  };
  // Loop body ends here
  // LowerBound, UpperBound, and Stride for createCanonicalLoop
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  Value *LB = ConstantInt::get(I32Ty, 0);
  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
  Value *ST = ConstantInt::get(I32Ty, 1);
      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
  if (!LoopInfo)
    return LoopInfo.takeError();

  // Lower the canonical loop as a statically scheduled workshare loop. The
  // final argument (!IsNowait) presumably requests the implicit
  // end-of-construct barrier -- confirm against applyStaticWorkshareLoop.
  InsertPointOrErrorTy WsloopIP =
      applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
                               WorksharingLoopType::ForStaticLoop, !IsNowait);
  if (!WsloopIP)
    return WsloopIP.takeError();
  InsertPointTy AfterIP = *WsloopIP;

  BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
  assert(LoopFini && "Bad structure of static workshare loop finalization");

  // Apply the finalization callback in LoopAfterBB
  auto FiniInfo = FinalizationStack.pop_back_val();
  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");
  if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
    return Err;

  return AfterIP;
}
2807
                                             BodyGenCallbackTy BodyGenCB,
                                             FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Wrap the user finalization callback so that it can also be invoked when
  // the insertion point sits at the end of a (terminator-less) cancellation
  // block.
  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done otherwise any nested constructs using FinalizeOMPRegion
    // will fail because that function requires the Finalization Basic Block to
    // have a terminator, which is already removed by EmitOMPRegionBody.
    // IP is currently at cancelation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancelation
    // to exit block.
    Builder.restoreIP(IP);
    auto *CaseBB = Loc.IP.getBlock();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  Directive OMPD = Directive::OMPD_sections;
  // Since we are using Finalization Callback here, HasFinalize
  // and IsCancellable have to be true
  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
                              /*Conditional*/ false, /*hasFinalize*/ true,
                              /*IsCancellable*/ true);
}
2842
2848
/// Return the hardware thread id of the current thread within its block by
/// calling the __kmpc_get_hardware_thread_id_in_block runtime entry point
/// (takes no arguments).
Value *OpenMPIRBuilder::getGPUThreadID() {
          OMPRTL___kmpc_get_hardware_thread_id_in_block),
      {});
}
2855
/// Return the device warp size by calling the __kmpc_get_warp_size runtime
/// entry point (takes no arguments).
Value *OpenMPIRBuilder::getGPUWarpSize() {
      getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
}
2860
2861Value *OpenMPIRBuilder::getNVPTXWarpID() {
2862 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2863 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2864}
2865
2866Value *OpenMPIRBuilder::getNVPTXLaneID() {
2867 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2868 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2869 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2870 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2871 "nvptx_lane_id");
2872}
2873
2874Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2875 Type *ToType) {
2876 Type *FromType = From->getType();
2877 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2878 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2879 assert(FromSize > 0 && "From size must be greater than zero");
2880 assert(ToSize > 0 && "To size must be greater than zero");
2881 if (FromType == ToType)
2882 return From;
2883 if (FromSize == ToSize)
2884 return Builder.CreateBitCast(From, ToType);
2885 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2886 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2887 InsertPointTy SaveIP = Builder.saveIP();
2888 Builder.restoreIP(AllocaIP);
2889 Value *CastItem = Builder.CreateAlloca(ToType);
2890 Builder.restoreIP(SaveIP);
2891
2892 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2893 CastItem, Builder.getPtrTy(0));
2894 Builder.CreateStore(From, ValCastItem);
2895 return Builder.CreateLoad(ToType, CastItem);
2896}
2897
/// Emit a call to the 32- or 64-bit __kmpc_shuffle runtime routine for a
/// single element of at most 8 bytes, widening the element to i32/i64 before
/// the call and narrowing the result back afterwards.
Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
                                                     Value *Element,
                                                     Type *ElementType,
                                                     Value *Offset) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
  Value *WarpSize =
      Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
  // Pick the 32-bit shuffle for elements of up to 4 bytes, the 64-bit one
  // otherwise.
      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
  Value *WarpSizeCast =
      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
  Value *ShuffleCall =
      createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
  // Reinterpret the shuffled value with the widened integer type.
  return castValueToType(AllocaIP, ShuffleCall, CastTy);
}
2919
/// Shuffle the element at \p SrcAddr across lanes (by \p Offset) and store
/// the result to \p DstAddr, splitting elements larger than 8 bytes into a
/// loop of 8/4/2/1-byte shuffles. \p IsByRefElem is not referenced in this
/// function body.
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                      Value *DstAddr, Type *ElemType,
                                      Value *Offset, Type *ReductionArrayTy,
                                      bool IsByRefElem) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Value *ElemPtr = DstAddr;
  Value *Ptr = SrcAddr;
  // Try chunk sizes of 8, 4, 2, then 1 bytes; after each pass only the
  // remainder (Size % IntSize) is left over for the smaller chunk sizes.
  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < IntSize)
      continue;
    Type *IntType = Builder.getIntNTy(IntSize * 8);
    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
    // One-past-the-end address of the source element; used as the loop bound.
    Value *SrcAddrGEP =
        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
    ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");

    Function *CurFunc = Builder.GetInsertBlock()->getParent();
    if ((Size / IntSize) > 1) {
      // More than one chunk of this size: emit a pre-condition loop that
      // shuffles IntSize bytes per iteration.
      Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
          SrcAddrGEP, Builder.getPtrTy());
      BasicBlock *PreCondBB =
          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
      BasicBlock *CurrentBB = Builder.GetInsertBlock();
      emitBlock(PreCondBB, CurFunc);
      // The PHIs carry the source/destination cursors across iterations.
      PHINode *PhiSrc =
          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr, CurrentBB);
      PHINode *PhiDest =
          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr, CurrentBB);
      Ptr = PhiSrc;
      ElemPtr = PhiDest;
      // Continue while at least IntSize bytes remain.
      Value *PtrDiff = Builder.CreatePtrDiff(
          Builder.getInt8Ty(), PtrEnd,
          Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
      Builder.CreateCondBr(
          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
          ExitBB);
      emitBlock(ThenBB, CurFunc);
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP,
          Builder.CreateAlignedLoad(
              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
          IntType, Offset);
      Builder.CreateAlignedStore(Res, ElemPtr,
                                 M.getDataLayout().getPrefTypeAlign(ElemType));
      // Advance both cursors by one chunk and branch back to the check.
      Value *LocalPtr =
          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      Value *LocalElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
      PhiSrc->addIncoming(LocalPtr, ThenBB);
      PhiDest->addIncoming(LocalElemPtr, ThenBB);
      emitBranch(PreCondBB);
      emitBlock(ExitBB, CurFunc);
    } else {
      // Exactly one chunk of this size: shuffle and store it straight-line.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
      // Narrow integers back down when the shuffle widened them.
      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
                                         Res->getType()->getScalarSizeInBits())
        Res = Builder.CreateTrunc(Res, ElemType);
      Builder.CreateStore(Res, ElemPtr);
      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      ElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
    }
    // Only the remainder is left for the smaller chunk sizes.
    Size = Size % IntSize;
  }
}
3004
/// Copy a Reduce list of partially aggregated values element-by-element from
/// \p SrcBase to \p DestBase according to \p Action, optionally shuffling
/// each element in from a remote lane (CopyOptions.RemoteLaneOffset).
Error OpenMPIRBuilder::emitReductionListCopy(
    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
    ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *SrcElementAddr = nullptr;
    AllocaInst *DestAlloca = nullptr;
    Value *DestElementAddr = nullptr;
    Value *DestElementPtrAddr = nullptr;
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    // Step 1.1: Get the address for the src element in the Reduce list.
    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, SrcBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);

    // Step 1.2: Create a temporary to store the element in the destination
    // Reduce list.
    DestElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, DestBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
    switch (Action) {
      // This action materializes a thread-private temporary at the dedicated
      // alloca insertion point; the element is later shuffled into it and the
      // dest list is updated to point at it.
      InsertPointTy CurIP = Builder.saveIP();
      Builder.restoreIP(AllocaIP);

      Type *DestAllocaType =
          IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
      DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
                                        ".omp.reduction.element");
      DestAlloca->setAlignment(
          M.getDataLayout().getPrefTypeAlign(DestAllocaType));
      DestElementAddr = DestAlloca;
      DestElementAddr =
          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
                                      DestElementAddr->getName() + ".ascast");
      Builder.restoreIP(CurIP);
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
      // Otherwise the destination element address is read directly from the
      // dest Reduce list.
      DestElementAddr =
          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
      break;
    }
    }

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Type *ShuffleType = RI.ElementType;
      Value *ShuffleSrcAddr = SrcElementAddr;
      Value *ShuffleDestAddr = DestElementAddr;
      AllocaInst *LocalStorage = nullptr;

      if (IsByRefElem) {
        assert(RI.ByRefElementType && "Expected by-ref element type to be set");
        assert(RI.ByRefAllocatedType &&
               "Expected by-ref allocated type to be set");
        // For by-ref reductions, we need to copy from the remote lane the
        // actual value of the partial reduction computed by that remote lane;
        // rather than, for example, a pointer to that data or, even worse, a
        // pointer to the descriptor of the by-ref reduction element.
        ShuffleType = RI.ByRefElementType;

        InsertPointOrErrorTy GenResult =
            RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);

        if (!GenResult)
          return GenResult.takeError();

        ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);

        {
          InsertPointTy OldIP = Builder.saveIP();
          Builder.restoreIP(AllocaIP);

          // Thread-private storage that receives the shuffled-in data.
          LocalStorage = Builder.CreateAlloca(ShuffleType);
          Builder.restoreIP(OldIP);
          ShuffleDestAddr = LocalStorage;
        }
      }

      shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
                      RemoteLaneOffset, ReductionArrayTy, IsByRefElem);

      if (IsByRefElem) {
        // Copy descriptor from source and update base_ptr to shuffled data
        Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
            DestAlloca, Builder.getPtrTy(), ".ascast");

        InsertPointOrErrorTy GenResult = generateReductionDescriptor(
            DestDescriptorAddr, LocalStorage, SrcElementAddr,
            RI.ByRefAllocatedType, RI.DataPtrPtrGen);

        if (!GenResult)
          return GenResult.takeError();
      }
    } else {
      switch (RI.EvaluationKind) {
      case EvalKind::Scalar: {
        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
        // Store the source element value to the dest element address.
        Builder.CreateStore(Elem, DestElementAddr);
        break;
      }
      case EvalKind::Complex: {
        // Copy the real and imaginary parts field-by-field.
        Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
        Value *SrcReal = Builder.CreateLoad(
            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
        Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
        Value *SrcImg = Builder.CreateLoad(
            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

        Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 0, ".realp");
        Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
        Builder.CreateStore(SrcReal, DestRealPtr);
        Builder.CreateStore(SrcImg, DestImgPtr);
        break;
      }
      case EvalKind::Aggregate: {
        // Aggregates are copied bitwise with a memcpy of the stored size.
        Value *SizeVal = Builder.getInt64(
            M.getDataLayout().getTypeStoreSize(RI.ElementType));
        Builder.CreateMemCpy(
            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SizeVal, false);
        break;
      }
      };
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          DestElementAddr, Builder.getPtrTy(),
          DestElementAddr->getName() + ".ascast");
      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
    }
  }

  return Error::success();
}
3170
/// Build the "_omp_reduction_inter_warp_copy_func" helper. It moves the
/// partially reduced values held by the master lane of every warp through a
/// __shared__ transfer array into the threads of the first warp, one
/// 4/2/1-byte chunk at a time.
Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
    const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
  InsertPointTy SavedIP = Builder.saveIP();
  LLVMContext &Ctx = M.getContext();
  // Signature: void(ptr ReduceList, i32 NumWarps).
  FunctionType *FuncTy = FunctionType::get(
      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
      /* IsVarArg */ false);
  Function *WcFunc =
      "_omp_reduction_inter_warp_copy_func", &M);
  WcFunc->setAttributes(FuncAttrs);
  WcFunc->addParamAttr(0, Attribute::NoUndef);
  WcFunc->addParamAttr(1, Attribute::NoUndef);
  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
  Builder.SetInsertPoint(EntryBB);

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  Argument *ReduceListArg = WcFunc->getArg(0);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  Argument *NumWarpsArg = WcFunc->getArg(1);

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
  ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
  if (!TransferMedium) {
    TransferMedium = new GlobalVariable(
        M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
        UndefValue::get(ArrayTy), TransferMediumName,
        /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
        /*AddressSpace=*/3);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  Value *GPUThreadID = getGPUThreadID();
  // nvptx_lane_id = nvptx_id % warpsize
  Value *LaneID = getNVPTXLaneID();
  // nvptx_warp_id = nvptx_id / warpsize
  Value *WarpID = getNVPTXWarpID();

  // Spill both arguments to allocas in the entry block, then continue code
  // generation after them.
  InsertPointTy AllocaIP =
      InsertPointTy(Builder.GetInsertBlock(),
                    Builder.GetInsertBlock()->getFirstInsertionPt());
  Type *Arg0Type = ReduceListArg->getType();
  Type *Arg1Type = NumWarpsArg->getType();
  Builder.restoreIP(AllocaIP);
  AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
      Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
  AllocaInst *NumWarpsAlloca =
      Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
  Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      NumWarpsAlloca, Builder.getPtrTy(0),
      NumWarpsAlloca->getName() + ".ascast");
  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
  Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
  AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
  InsertPointTy CodeGenIP =
      getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
  Builder.restoreIP(CodeGenIP);

  Value *ReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);

  for (auto En : enumerate(ReductionInfos)) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    const ReductionInfo &RI = En.value();
    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
        IsByRefElem ? RI.ByRefElementType : RI.ElementType);
    // Copy the element through the i32 transfer array in chunks of 4, then 2,
    // then 1 bytes; the remainder after each pass goes to the smaller size.
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
      Type *CType = Builder.getIntNTy(TySize * 8);

      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      Value *Cnt = nullptr;
      Value *CntAddr = nullptr;
      BasicBlock *PrecondBB = nullptr;
      BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        // More than one chunk of this size: materialize a counter and the
        // precond/body/exit blocks of a copy loop.
        CodeGenIP = Builder.saveIP();
        Builder.restoreIP(AllocaIP);
        CntAddr =
            Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");

        CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
                                              CntAddr->getName() + ".ascast");
        Builder.restoreIP(CodeGenIP);
        Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
                            CntAddr,
                            /*Volatile=*/false);
        PrecondBB = BasicBlock::Create(Ctx, "precond");
        ExitBB = BasicBlock::Create(Ctx, "exit");
        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
        emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
        Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
                                 /*Volatile=*/false);
        Value *Cmp = Builder.CreateICmpULT(
            Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
        Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
        emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
      }

      // kmpc_barrier.
      InsertPointOrErrorTy BarrierIP1 =
          createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                        omp::Directive::OMPD_unknown,
                        /* ForceSimpleCall */ false,
                        /* CheckCancelFlag */ true);
      if (!BarrierIP1)
        return BarrierIP1.takeError();
      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");

      // if (lane_id == 0)
      Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
      Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());

      // Reduce element = LocalReduceList[i]
      auto *RedListArrayTy =
          ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
      Type *IndexTy = Builder.getIndexTy(
          M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
      Value *ElemPtrPtr =
          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
                                    {ConstantInt::get(IndexTy, 0),
                                     ConstantInt::get(IndexTy, En.index())});
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);

      if (IsByRefElem) {
        // By-ref element: chase the descriptor down to the data pointer.
        InsertPointOrErrorTy GenRes =
            RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);

        if (!GenRes)
          return GenRes.takeError();

        ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
      }

      if (NumIters > 1)
        ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      Value *MediumPtr = Builder.CreateInBoundsGEP(
          ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
      // elem = *elemptr
      //*MediumPtr = elem
      Value *Elem = Builder.CreateLoad(CType, ElemPtr);
      // Store the source element value to the dest element address.
      Builder.CreateStore(Elem, MediumPtr,
                          /*IsVolatile*/ true);
      Builder.CreateBr(MergeBB);

      // else
      emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
      Builder.CreateBr(MergeBB);

      // endif
      emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
      InsertPointOrErrorTy BarrierIP2 =
          createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                        omp::Directive::OMPD_unknown,
                        /* ForceSimpleCall */ false,
                        /* CheckCancelFlag */ true);
      if (!BarrierIP2)
        return BarrierIP2.takeError();

      // Warp 0 copies reduce element from transfer medium
      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");

      Value *NumWarpsVal =
          Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
      // Up to 32 threads in warp 0 are active.
      Value *IsActiveThread =
          Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
      Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());

      // SecMediumPtr = &medium[tid]
      // SrcMediumVal = *SrcMediumPtr
      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
          ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Value *TargetElemPtrPtr =
          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
                                    {ConstantInt::get(IndexTy, 0),
                                     ConstantInt::get(IndexTy, En.index())});
      Value *TargetElemPtrVal =
          Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
      Value *TargetElemPtr = TargetElemPtrVal;

      if (IsByRefElem) {
        InsertPointOrErrorTy GenRes =
            RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);

        if (!GenRes)
          return GenRes.takeError();

        TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
      }

      if (NumIters > 1)
        TargetElemPtr =
            Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);

      // *TargetElemPtr = SrcMediumVal;
      Value *SrcMediumValue =
          Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
      Builder.CreateStore(SrcMediumValue, TargetElemPtr);
      Builder.CreateBr(W0MergeBB);

      emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
      Builder.CreateBr(W0MergeBB);

      emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());

      if (NumIters > 1) {
        // Increment the chunk counter and close the copy loop.
        Cnt = Builder.CreateNSWAdd(
            Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
        Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);

        auto *CurFn = Builder.GetInsertBlock()->getParent();
        emitBranch(PrecondBB);
        emitBlock(ExitBB, CurFn);
      }
      // Remainder bytes are handled by the next-smaller chunk size.
      RealTySize %= TySize;
    }
  }

  Builder.CreateRetVoid();
  Builder.restoreIP(SavedIP);

  return WcFunc;
}
3429
3430Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3431 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3432 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3433 LLVMContext &Ctx = M.getContext();
3434 FunctionType *FuncTy =
3435 FunctionType::get(Builder.getVoidTy(),
3436 {Builder.getPtrTy(), Builder.getInt16Ty(),
3437 Builder.getInt16Ty(), Builder.getInt16Ty()},
3438 /* IsVarArg */ false);
3439 Function *SarFunc =
3441 "_omp_reduction_shuffle_and_reduce_func", &M);
3442 SarFunc->setAttributes(FuncAttrs);
3443 SarFunc->addParamAttr(0, Attribute::NoUndef);
3444 SarFunc->addParamAttr(1, Attribute::NoUndef);
3445 SarFunc->addParamAttr(2, Attribute::NoUndef);
3446 SarFunc->addParamAttr(3, Attribute::NoUndef);
3447 SarFunc->addParamAttr(1, Attribute::SExt);
3448 SarFunc->addParamAttr(2, Attribute::SExt);
3449 SarFunc->addParamAttr(3, Attribute::SExt);
3450 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3451 Builder.SetInsertPoint(EntryBB);
3452
3453 // Thread local Reduce list used to host the values of data to be reduced.
3454 Argument *ReduceListArg = SarFunc->getArg(0);
3455 // Current lane id; could be logical.
3456 Argument *LaneIDArg = SarFunc->getArg(1);
3457 // Offset of the remote source lane relative to the current lane.
3458 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3459 // Algorithm version. This is expected to be known at compile time.
3460 Argument *AlgoVerArg = SarFunc->getArg(3);
3461
3462 Type *ReduceListArgType = ReduceListArg->getType();
3463 Type *LaneIDArgType = LaneIDArg->getType();
3464 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3465 Value *ReduceListAlloca = Builder.CreateAlloca(
3466 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3467 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3468 LaneIDArg->getName() + ".addr");
3469 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3470 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3471 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3472 AlgoVerArg->getName() + ".addr");
3473 ArrayType *RedListArrayTy =
3474 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3475
3476 // Create a local thread-private variable to host the Reduce list
3477 // from a remote lane.
3478 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3479 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3480
3481 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3482 ReduceListAlloca, ReduceListArgType,
3483 ReduceListAlloca->getName() + ".ascast");
3484 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3485 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3486 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3488 RemoteLaneOffsetAlloca->getName() + ".ascast");
3489 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3490 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3491 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3492 RemoteReductionListAlloca, Builder.getPtrTy(),
3493 RemoteReductionListAlloca->getName() + ".ascast");
3494
3495 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3496 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3497 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3498 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3499
3500 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3501 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3502 Value *RemoteLaneOffset =
3503 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3504 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3505
3506 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3507
3508 // This loop iterates through the list of reduce elements and copies,
3509 // element by element, from a remote lane in the warp to RemoteReduceList,
3510 // hosted on the thread's stack.
3511 Error EmitRedLsCpRes = emitReductionListCopy(
3512 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3513 ReduceList, RemoteListAddrCast, IsByRef,
3514 {RemoteLaneOffset, nullptr, nullptr});
3515
3516 if (EmitRedLsCpRes)
3517 return EmitRedLsCpRes;
3518
3519 // The actions to be performed on the Remote Reduce list is dependent
3520 // on the algorithm version.
3521 //
3522 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3523 // LaneId % 2 == 0 && Offset > 0):
3524 // do the reduction value aggregation
3525 //
3526 // The thread local variable Reduce list is mutated in place to host the
3527 // reduced data, which is the aggregated value produced from local and
3528 // remote lanes.
3529 //
3530 // Note that AlgoVer is expected to be a constant integer known at compile
3531 // time.
3532 // When AlgoVer==0, the first conjunction evaluates to true, making
3533 // the entire predicate true during compile time.
3534 // When AlgoVer==1, the second conjunction has only the second part to be
3535 // evaluated during runtime. Other conjunctions evaluates to false
3536 // during compile time.
3537 // When AlgoVer==2, the third conjunction has only the second part to be
3538 // evaluated during runtime. Other conjunctions evaluates to false
3539 // during compile time.
3540 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3541 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3542 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3543 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3544 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3545 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3546 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3547 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3548 Value *RemoteOffsetComp =
3549 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3550 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3551 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3552 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3553
3554 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3555 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3556 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3557
3558 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3559 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3560 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3561 ReduceList, Builder.getPtrTy());
3562 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3563 RemoteListAddrCast, Builder.getPtrTy());
3564 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3565 ->addFnAttr(Attribute::NoUnwind);
3566 Builder.CreateBr(MergeBB);
3567
3568 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3569 Builder.CreateBr(MergeBB);
3570
3571 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3572
3573 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3574 // Reduce list.
3575 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3576 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3577 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3578
3579 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3580 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3581 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3582 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3583
3584 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3585
3586 EmitRedLsCpRes = emitReductionListCopy(
3587 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3588 RemoteListAddrCast, ReduceList, IsByRef);
3589
3590 if (EmitRedLsCpRes)
3591 return EmitRedLsCpRes;
3592
3593 Builder.CreateBr(CpyMergeBB);
3594
3595 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3596 Builder.CreateBr(CpyMergeBB);
3597
3598 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3599
3600 Builder.CreateRetVoid();
3601
3602 return SarFunc;
3603}
3604
3606OpenMPIRBuilder::generateReductionDescriptor(
3607 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3608 Type *DescriptorType,
3609 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3610 DataPtrPtrGen) {
3611
3612 // Copy the source descriptor to preserve all metadata (rank, extents,
3613 // strides, etc.)
3614 Value *DescriptorSize =
3615 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3616 Builder.CreateMemCpy(
3617 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3618 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3619 DescriptorSize);
3620
3621 // Update the base pointer field to point to the local shuffled data
3622 Value *DataPtrField;
3623 InsertPointOrErrorTy GenResult =
3624 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3625
3626 if (!GenResult)
3627 return GenResult.takeError();
3628
3629 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3630 DataPtr, Builder.getPtrTy(), ".ascast"),
3631 DataPtrField);
3632
3633 return Builder.saveIP();
3634}
3635
3636Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3637 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3638 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3639 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3640 LLVMContext &Ctx = M.getContext();
3641 FunctionType *FuncTy = FunctionType::get(
3642 Builder.getVoidTy(),
3643 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3644 /* IsVarArg */ false);
3645 Function *LtGCFunc =
3647 "_omp_reduction_list_to_global_copy_func", &M);
3648 LtGCFunc->setAttributes(FuncAttrs);
3649 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3650 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3651 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3652
3653 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3654 Builder.SetInsertPoint(EntryBlock);
3655
3656 // Buffer: global reduction buffer.
3657 Argument *BufferArg = LtGCFunc->getArg(0);
3658 // Idx: index of the buffer.
3659 Argument *IdxArg = LtGCFunc->getArg(1);
3660 // ReduceList: thread local Reduce list.
3661 Argument *ReduceListArg = LtGCFunc->getArg(2);
3662
3663 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3664 BufferArg->getName() + ".addr");
3665 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3666 IdxArg->getName() + ".addr");
3667 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3668 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3669 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3670 BufferArgAlloca, Builder.getPtrTy(),
3671 BufferArgAlloca->getName() + ".ascast");
3672 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3673 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3674 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3675 ReduceListArgAlloca, Builder.getPtrTy(),
3676 ReduceListArgAlloca->getName() + ".ascast");
3677
3678 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3679 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3680 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3681
3682 Value *LocalReduceList =
3683 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3684 Value *BufferArgVal =
3685 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3686 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3687 Type *IndexTy = Builder.getIndexTy(
3688 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3689 for (auto En : enumerate(ReductionInfos)) {
3690 const ReductionInfo &RI = En.value();
3691 auto *RedListArrayTy =
3692 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3693 // Reduce element = LocalReduceList[i]
3694 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3695 RedListArrayTy, LocalReduceList,
3696 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3697 // elemptr = ((CopyType*)(elemptrptr)) + I
3698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3699
3700 // Global = Buffer.VD[Idx];
3701 Value *BufferVD =
3702 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3703 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3704 ReductionsBufferTy, BufferVD, 0, En.index());
3705
3706 switch (RI.EvaluationKind) {
3707 case EvalKind::Scalar: {
3708 Value *TargetElement;
3709
3710 if (IsByRef.empty() || !IsByRef[En.index()]) {
3711 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3712 } else {
3713 InsertPointOrErrorTy GenResult =
3714 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3715
3716 if (!GenResult)
3717 return GenResult.takeError();
3718
3719 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3720 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3721 }
3722
3723 Builder.CreateStore(TargetElement, GlobVal);
3724 break;
3725 }
3726 case EvalKind::Complex: {
3727 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3728 RI.ElementType, ElemPtr, 0, 0, ".realp");
3729 Value *SrcReal = Builder.CreateLoad(
3730 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3731 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3732 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3733 Value *SrcImg = Builder.CreateLoad(
3734 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3735
3736 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3737 RI.ElementType, GlobVal, 0, 0, ".realp");
3738 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3739 RI.ElementType, GlobVal, 0, 1, ".imagp");
3740 Builder.CreateStore(SrcReal, DestRealPtr);
3741 Builder.CreateStore(SrcImg, DestImgPtr);
3742 break;
3743 }
3744 case EvalKind::Aggregate: {
3745 Value *SizeVal =
3746 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3747 Builder.CreateMemCpy(
3748 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3749 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3750 break;
3751 }
3752 }
3753 }
3754
3755 Builder.CreateRetVoid();
3756 Builder.restoreIP(OldIP);
3757 return LtGCFunc;
3758}
3759
3760Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3761 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3762 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3763 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3764 LLVMContext &Ctx = M.getContext();
3765 FunctionType *FuncTy = FunctionType::get(
3766 Builder.getVoidTy(),
3767 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3768 /* IsVarArg */ false);
3769 Function *LtGRFunc =
3771 "_omp_reduction_list_to_global_reduce_func", &M);
3772 LtGRFunc->setAttributes(FuncAttrs);
3773 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3774 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3775 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3776
3777 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3778 Builder.SetInsertPoint(EntryBlock);
3779
3780 // Buffer: global reduction buffer.
3781 Argument *BufferArg = LtGRFunc->getArg(0);
3782 // Idx: index of the buffer.
3783 Argument *IdxArg = LtGRFunc->getArg(1);
3784 // ReduceList: thread local Reduce list.
3785 Argument *ReduceListArg = LtGRFunc->getArg(2);
3786
3787 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3788 BufferArg->getName() + ".addr");
3789 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3790 IdxArg->getName() + ".addr");
3791 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3792 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3793 auto *RedListArrayTy =
3794 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3795
3796 // 1. Build a list of reduction variables.
3797 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3798 Value *LocalReduceList =
3799 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3800
3801 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3802
3803 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3804 BufferArgAlloca, Builder.getPtrTy(),
3805 BufferArgAlloca->getName() + ".ascast");
3806 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3807 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3808 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3809 ReduceListArgAlloca, Builder.getPtrTy(),
3810 ReduceListArgAlloca->getName() + ".ascast");
3811 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3812 LocalReduceList, Builder.getPtrTy(),
3813 LocalReduceList->getName() + ".ascast");
3814
3815 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3816 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3817 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3818
3819 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3820 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3821 Type *IndexTy = Builder.getIndexTy(
3822 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3823 for (auto En : enumerate(ReductionInfos)) {
3824 const ReductionInfo &RI = En.value();
3825 Value *ByRefAlloc;
3826
3827 if (!IsByRef.empty() && IsByRef[En.index()]) {
3828 InsertPointTy OldIP = Builder.saveIP();
3829 Builder.restoreIP(AllocaIP);
3830
3831 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3832 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3833 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3834
3835 Builder.restoreIP(OldIP);
3836 }
3837
3838 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3839 RedListArrayTy, LocalReduceListAddrCast,
3840 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3841 Value *BufferVD =
3842 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3843 // Global = Buffer.VD[Idx];
3844 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3845 ReductionsBufferTy, BufferVD, 0, En.index());
3846
3847 if (!IsByRef.empty() && IsByRef[En.index()]) {
3848 // Get source descriptor from the reduce list argument
3849 Value *ReduceList =
3850 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3851 Value *SrcElementPtrPtr =
3852 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3853 {ConstantInt::get(IndexTy, 0),
3854 ConstantInt::get(IndexTy, En.index())});
3855 Value *SrcDescriptorAddr =
3856 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3857
3858 // Copy descriptor from source and update base_ptr to global buffer data
3859 InsertPointOrErrorTy GenResult =
3860 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3861 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3862
3863 if (!GenResult)
3864 return GenResult.takeError();
3865
3866 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3867 } else {
3868 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3869 }
3870 }
3871
3872 // Call reduce_function(GlobalReduceList, ReduceList)
3873 Value *ReduceList =
3874 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3875 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3876 ->addFnAttr(Attribute::NoUnwind);
3877 Builder.CreateRetVoid();
3878 Builder.restoreIP(OldIP);
3879 return LtGRFunc;
3880}
3881
3882Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3883 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3884 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3885 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3886 LLVMContext &Ctx = M.getContext();
3887 FunctionType *FuncTy = FunctionType::get(
3888 Builder.getVoidTy(),
3889 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3890 /* IsVarArg */ false);
3891 Function *GtLCFunc =
3893 "_omp_reduction_global_to_list_copy_func", &M);
3894 GtLCFunc->setAttributes(FuncAttrs);
3895 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3896 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3897 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3898
3899 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3900 Builder.SetInsertPoint(EntryBlock);
3901
3902 // Buffer: global reduction buffer.
3903 Argument *BufferArg = GtLCFunc->getArg(0);
3904 // Idx: index of the buffer.
3905 Argument *IdxArg = GtLCFunc->getArg(1);
3906 // ReduceList: thread local Reduce list.
3907 Argument *ReduceListArg = GtLCFunc->getArg(2);
3908
3909 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3910 BufferArg->getName() + ".addr");
3911 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3912 IdxArg->getName() + ".addr");
3913 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3914 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3915 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3916 BufferArgAlloca, Builder.getPtrTy(),
3917 BufferArgAlloca->getName() + ".ascast");
3918 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3919 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3920 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3921 ReduceListArgAlloca, Builder.getPtrTy(),
3922 ReduceListArgAlloca->getName() + ".ascast");
3923 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3924 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3925 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3926
3927 Value *LocalReduceList =
3928 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3929 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3930 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3931 Type *IndexTy = Builder.getIndexTy(
3932 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3933 for (auto En : enumerate(ReductionInfos)) {
3934 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3935 auto *RedListArrayTy =
3936 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3937 // Reduce element = LocalReduceList[i]
3938 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3939 RedListArrayTy, LocalReduceList,
3940 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3941 // elemptr = ((CopyType*)(elemptrptr)) + I
3942 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3943 // Global = Buffer.VD[Idx];
3944 Value *BufferVD =
3945 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3946 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3947 ReductionsBufferTy, BufferVD, 0, En.index());
3948
3949 switch (RI.EvaluationKind) {
3950 case EvalKind::Scalar: {
3951 Type *ElemType = RI.ElementType;
3952
3953 if (!IsByRef.empty() && IsByRef[En.index()]) {
3954 ElemType = RI.ByRefElementType;
3955 InsertPointOrErrorTy GenResult =
3956 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3957
3958 if (!GenResult)
3959 return GenResult.takeError();
3960
3961 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3962 }
3963
3964 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3965 Builder.CreateStore(TargetElement, ElemPtr);
3966 break;
3967 }
3968 case EvalKind::Complex: {
3969 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3970 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3971 Value *SrcReal = Builder.CreateLoad(
3972 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3973 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3974 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3975 Value *SrcImg = Builder.CreateLoad(
3976 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3977
3978 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3979 RI.ElementType, ElemPtr, 0, 0, ".realp");
3980 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3981 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3982 Builder.CreateStore(SrcReal, DestRealPtr);
3983 Builder.CreateStore(SrcImg, DestImgPtr);
3984 break;
3985 }
3986 case EvalKind::Aggregate: {
3987 Value *SizeVal =
3988 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3989 Builder.CreateMemCpy(
3990 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3991 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3992 SizeVal, false);
3993 break;
3994 }
3995 }
3996 }
3997
3998 Builder.CreateRetVoid();
3999 Builder.restoreIP(OldIP);
4000 return GtLCFunc;
4001}
4002
4003Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4004 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4005 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4006 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4007 LLVMContext &Ctx = M.getContext();
4008 auto *FuncTy = FunctionType::get(
4009 Builder.getVoidTy(),
4010 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4011 /* IsVarArg */ false);
4012 Function *GtLRFunc =
4014 "_omp_reduction_global_to_list_reduce_func", &M);
4015 GtLRFunc->setAttributes(FuncAttrs);
4016 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4017 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4018 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4019
4020 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4021 Builder.SetInsertPoint(EntryBlock);
4022
4023 // Buffer: global reduction buffer.
4024 Argument *BufferArg = GtLRFunc->getArg(0);
4025 // Idx: index of the buffer.
4026 Argument *IdxArg = GtLRFunc->getArg(1);
4027 // ReduceList: thread local Reduce list.
4028 Argument *ReduceListArg = GtLRFunc->getArg(2);
4029
4030 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4031 BufferArg->getName() + ".addr");
4032 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4033 IdxArg->getName() + ".addr");
4034 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4035 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4036 ArrayType *RedListArrayTy =
4037 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4038
4039 // 1. Build a list of reduction variables.
4040 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4041 Value *LocalReduceList =
4042 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4043
4044 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4045
4046 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4047 BufferArgAlloca, Builder.getPtrTy(),
4048 BufferArgAlloca->getName() + ".ascast");
4049 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4050 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4051 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4052 ReduceListArgAlloca, Builder.getPtrTy(),
4053 ReduceListArgAlloca->getName() + ".ascast");
4054 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4055 LocalReduceList, Builder.getPtrTy(),
4056 LocalReduceList->getName() + ".ascast");
4057
4058 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4059 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4060 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4061
4062 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4063 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4064 Type *IndexTy = Builder.getIndexTy(
4065 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4066 for (auto En : enumerate(ReductionInfos)) {
4067 const ReductionInfo &RI = En.value();
4068 Value *ByRefAlloc;
4069
4070 if (!IsByRef.empty() && IsByRef[En.index()]) {
4071 InsertPointTy OldIP = Builder.saveIP();
4072 Builder.restoreIP(AllocaIP);
4073
4074 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4075 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4076 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4077
4078 Builder.restoreIP(OldIP);
4079 }
4080
4081 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4082 RedListArrayTy, ReductionList,
4083 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4084 // Global = Buffer.VD[Idx];
4085 Value *BufferVD =
4086 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4087 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4088 ReductionsBufferTy, BufferVD, 0, En.index());
4089
4090 if (!IsByRef.empty() && IsByRef[En.index()]) {
4091 // Get source descriptor from the reduce list
4092 Value *ReduceListVal =
4093 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4094 Value *SrcElementPtrPtr =
4095 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4096 {ConstantInt::get(IndexTy, 0),
4097 ConstantInt::get(IndexTy, En.index())});
4098 Value *SrcDescriptorAddr =
4099 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4100
4101 // Copy descriptor from source and update base_ptr to global buffer data
4102 InsertPointOrErrorTy GenResult =
4103 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4104 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4105 if (!GenResult)
4106 return GenResult.takeError();
4107
4108 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4109 } else {
4110 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4111 }
4112 }
4113
4114 // Call reduce_function(ReduceList, GlobalReduceList)
4115 Value *ReduceList =
4116 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4117 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4118 ->addFnAttr(Attribute::NoUnwind);
4119 Builder.CreateRetVoid();
4120 Builder.restoreIP(OldIP);
4121 return GtLRFunc;
4122}
4123
4124std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4125 std::string Suffix =
4126 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4127 return (Name + Suffix).str();
4128}
4129
/// Emit the outlined "reduction function" used by the device reduction
/// runtime: a void(ptr, ptr) helper whose two arguments point to arrays of
/// per-variable element pointers (LHS partials accumulated into, RHS
/// partials read from). Each element pair is combined with the matching
/// generator callback from \p ReductionInfos.
4130Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4131 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4133 AttributeList FuncAttrs) {
4134 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4135 {Builder.getPtrTy(), Builder.getPtrTy()},
4136 /* IsVarArg */ false);
4137 std::string Name = getReductionFuncName(ReducerName);
4138 Function *ReductionFunc =
4140 ReductionFunc->setAttributes(FuncAttrs);
4141 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4142 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4143 BasicBlock *EntryBB =
4144 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4145 Builder.SetInsertPoint(EntryBB);
4146
4147 // Need to alloca memory here and deal with the pointers before getting
4148 // LHS/RHS pointers out
4149 Value *LHSArrayPtr = nullptr;
4150 Value *RHSArrayPtr = nullptr;
4151 Argument *Arg0 = ReductionFunc->getArg(0);
4152 Argument *Arg1 = ReductionFunc->getArg(1);
4153 Type *Arg0Type = Arg0->getType();
4154 Type *Arg1Type = Arg1->getType();
4155
 // Spill the incoming pointers through allocas (with addrspace casts) so
 // that subsequent loads yield pointers usable in the generic address space.
4156 Value *LHSAlloca =
4157 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4158 Value *RHSAlloca =
4159 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4160 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4161 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4162 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4163 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4164 Builder.CreateStore(Arg0, LHSAddrCast);
4165 Builder.CreateStore(Arg1, RHSAddrCast);
4166 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4167 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4168
 // Pull the per-variable element pointers out of the two argument arrays.
4169 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4170 Type *IndexTy = Builder.getIndexTy(
4171 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4172 SmallVector<Value *> LHSPtrs, RHSPtrs;
4173 for (auto En : enumerate(ReductionInfos)) {
4174 const ReductionInfo &RI = En.value();
4175 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4176 RedArrayTy, RHSArrayPtr,
4177 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4178 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4179 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4180 RHSI8Ptr, RI.PrivateVariable->getType(),
4181 RHSI8Ptr->getName() + ".ascast");
4182
4183 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4184 RedArrayTy, LHSArrayPtr,
4185 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4186 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4187 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4188 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4189
 // Pointer-collection path (the branch guard is not visible in this
 // excerpt): remember the per-variable pointers so the Clang-emitted
 // callbacks in the fixup loop below can be patched to use them.
4191 LHSPtrs.emplace_back(LHSPtr);
4192 RHSPtrs.emplace_back(RHSPtr);
4193 } else {
 // Callback-per-variable path: load the scalar values (unless by-ref),
 // combine them with the user-supplied reduction generator, and store
 // the result back to the LHS slot.
4194 Value *LHS = LHSPtr;
4195 Value *RHS = RHSPtr;
4196
4197 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4198 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4199 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4200 }
4201
4202 Value *Reduced;
4203 InsertPointOrErrorTy AfterIP =
4204 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4205 if (!AfterIP)
4206 return AfterIP.takeError();
 // The generator may invalidate the insertion point; bail out early.
4207 if (!Builder.GetInsertBlock())
4208 return ReductionFunc;
4209
4210 Builder.restoreIP(*AfterIP);
4211
4212 if (!IsByRef.empty() && !IsByRef[En.index()])
4213 Builder.CreateStore(Reduced, LHSPtr);
4214 }
4215 }
4216
 // Clang-callback mode: emit each callback and rewrite its placeholder
 // LHS/RHS pointers (within this function only) to the element pointers
 // gathered above.
4218 for (auto En : enumerate(ReductionInfos)) {
4219 unsigned Index = En.index();
4220 const ReductionInfo &RI = En.value();
4221 Value *LHSFixupPtr, *RHSFixupPtr;
4222 Builder.restoreIP(RI.ReductionGenClang(
4223 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4224
4225 // Fix the CallBack code generated to use the correct Values for the LHS
4226 // and RHS
4227 LHSFixupPtr->replaceUsesWithIf(
4228 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4229 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4230 ReductionFunc;
4231 });
4232 RHSFixupPtr->replaceUsesWithIf(
4233 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4234 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4235 ReductionFunc;
4236 });
4237 }
4238
4239 Builder.CreateRetVoid();
4240 // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4241 // to the entry block (this is done for higher opt levels by later passes in
4242 // the pipeline). This has caused issues because non-entry `alloca`s force the
4243 // function to use dynamic stack allocations and we might run out of scratch
4244 // memory.
4245 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4246
4247 return ReductionFunc;
4248}
4249
/// Debug-build sanity checks for a list of ReductionInfo descriptors. All
/// checks are asserts, so this compiles to a no-op in NDEBUG builds; the
/// (void)RI keeps the loop variable "used" in that configuration. The
/// Variable/PrivateVariable type-equality check is skipped on GPU, where
/// the two may live in different address spaces.
4250static void
4252 bool IsGPU) {
4253 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4254 (void)RI;
4255 assert(RI.Variable && "expected non-null variable");
4256 assert(RI.PrivateVariable && "expected non-null private variable");
4257 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4258 "expected non-null reduction generator callback");
4259 if (!IsGPU) {
4260 assert(
4261 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4262 "expected variables and their private equivalents to have the same "
4263 "type");
4264 }
4265 assert(RI.Variable->getType()->isPointerTy() &&
4266 "expected variables to be pointers");
4267 }
4268}
4269
/// Lower OpenMP reductions for device (GPU) code. Builds the type-erased
/// "RedList" array of private values, emits the outlined helper functions
/// (the main reduction function, shuffle-and-reduce, inter-warp copy, and -
/// for teams reductions - the four list<->global-buffer helpers), calls the
/// matching __kmpc_nvptx_*_reduce_nowait_v2 runtime entry point, and emits
/// the "res == 1" winner path that writes the final values back to the
/// original variables.
4271 const LocationDescription &Loc, InsertPointTy AllocaIP,
4272 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4273 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4274 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4275 unsigned ReductionBufNum, Value *SrcLocInfo) {
4276 if (!updateToLocation(Loc))
4277 return InsertPointTy();
4278 Builder.restoreIP(CodeGenIP);
4279 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4280 LLVMContext &Ctx = M.getContext();
4281
4282 // Source location for the ident struct
4283 if (!SrcLocInfo) {
4284 uint32_t SrcLocStrSize;
4285 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4286 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4287 }
4288
4289 if (ReductionInfos.size() == 0)
4290 return Builder.saveIP();
4291
 // Split off a continuation block (branch guard for this path is not
 // visible in this excerpt).
4292 BasicBlock *ContinuationBlock = nullptr;
4294 // Copied code from createReductions
4295 BasicBlock *InsertBlock = Loc.IP.getBlock();
4296 ContinuationBlock =
4297 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4298 InsertBlock->getTerminator()->eraseFromParent();
4299 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4300 }
4301
 // Propagate the current function's attributes to the outlined helpers,
 // but drop OptimizeNone so they can be optimized.
4302 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4303 AttributeList FuncAttrs;
4304 AttrBuilder AttrBldr(Ctx);
4305 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4306 AttrBldr.addAttribute(Attr);
4307 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4308 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4309
4310 CodeGenIP = Builder.saveIP();
4311 Expected<Function *> ReductionResult = createReductionFunction(
4312 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4313 ReductionGenCBKind, FuncAttrs);
4314 if (!ReductionResult)
4315 return ReductionResult.takeError();
4316 Function *ReductionFunc = *ReductionResult;
4317 Builder.restoreIP(CodeGenIP);
4318
4319 // Set the grid value in the config needed for lowering later on
4320 if (GridValue.has_value())
4321 Config.setGridValue(GridValue.value());
4322 else
4323 Config.setGridValue(getGridValue(T, ReductionFunc));
4324
4325 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4326 // RedList, shuffle_reduce_func, interwarp_copy_func);
4327 // or
4328 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4329 Value *Res;
4330
4331 // 1. Build a list of reduction variables.
4332 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4333 auto Size = ReductionInfos.size();
4334 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4335 Type *FuncPtrTy =
4336 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4337 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4338 CodeGenIP = Builder.saveIP();
4339 Builder.restoreIP(AllocaIP);
4340 Value *ReductionListAlloca =
4341 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4342 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4343 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4344 Builder.restoreIP(CodeGenIP);
4345 Type *IndexTy = Builder.getIndexTy(
4346 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4347 for (auto En : enumerate(ReductionInfos)) {
4348 const ReductionInfo &RI = En.value();
4349 Value *ElemPtr = Builder.CreateInBoundsGEP(
4350 RedArrayTy, ReductionList,
4351 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4352
 // By-ref variables hold a pointer to the data; dereference once so the
 // RedList slot points at the payload itself.
4353 Value *PrivateVar = RI.PrivateVariable;
4354 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4355 if (IsByRefElem)
4356 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4357
4358 Value *CastElem =
4359 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4360 Builder.CreateStore(CastElem, ElemPtr);
4361 }
 // 2. Emit the outlined shuffle-and-reduce and inter-warp copy helpers.
4362 CodeGenIP = Builder.saveIP();
4363 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4364 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4365
4366 if (!SarFunc)
4367 return SarFunc.takeError();
4368
4369 Expected<Function *> CopyResult =
4370 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4371 if (!CopyResult)
4372 return CopyResult.takeError();
4373 Function *WcFunc = *CopyResult;
4374 Builder.restoreIP(CodeGenIP);
4375
4376 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4377
 // The runtime sizes its scratch space as max(element size) * #variables.
4378 unsigned MaxDataSize = 0;
4379 SmallVector<Type *> ReductionTypeArgs;
4380 for (auto En : enumerate(ReductionInfos)) {
4381 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
4382 if (Size > MaxDataSize)
4383 MaxDataSize = Size;
4384 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4385 ? En.value().ByRefElementType
4386 : En.value().ElementType;
4387 ReductionTypeArgs.emplace_back(RedTypeArg);
4388 }
4389 Value *ReductionDataSize =
4390 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4391 if (!IsTeamsReduction) {
 // Parallel reduction: hand the two helpers to the runtime.
4392 Value *SarFuncCast =
4393 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4394 Value *WcFuncCast =
4395 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4396 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4397 WcFuncCast};
4399 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4400 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4401 } else {
 // Teams reduction: additionally needs the globalized buffer type and the
 // four list<->global copy/reduce helpers.
4402 CodeGenIP = Builder.saveIP();
4403 StructType *ReductionsBufferTy = StructType::create(
4404 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4405 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4406 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4407
4408 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4409 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4410 if (!LtGCFunc)
4411 return LtGCFunc.takeError();
4412
4413 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4414 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4415 if (!LtGRFunc)
4416 return LtGRFunc.takeError();
4417
4418 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4419 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4420 if (!GtLCFunc)
4421 return GtLCFunc.takeError();
4422
4423 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4424 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4425 if (!GtLRFunc)
4426 return GtLRFunc.takeError();
4427
4428 Builder.restoreIP(CodeGenIP);
4429
4430 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4431 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4432
4433 Value *Args3[] = {SrcLocInfo,
4434 KernelTeamsReductionPtr,
4435 Builder.getInt32(ReductionBufNum),
4436 ReductionDataSize,
4437 RL,
4438 *SarFunc,
4439 WcFunc,
4440 *LtGCFunc,
4441 *LtGRFunc,
4442 *GtLCFunc,
4443 *GtLRFunc};
4444
4445 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4446 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4447 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4448 }
4449
4450 // 5. Build if (res == 1)
4451 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4452 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4453 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4454 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4455
4456 // 6. Build then branch: where we have reduced values in the master
4457 // thread in each team.
4458 // __kmpc_end_reduce{_nowait}(<gtid>);
4459 // break;
4460 emitBlock(ThenBB, CurFunc);
4461
4462 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4463 for (auto En : enumerate(ReductionInfos)) {
4464 const ReductionInfo &RI = En.value();
4466 Value *RedValue = RI.Variable;
4467 Value *RHS =
4468 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4469
 // Clang-callback path (the branch guard is not visible in this excerpt):
 // emit the callback, then patch its placeholder pointers.
4471 Value *LHSPtr, *RHSPtr;
4472 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4473 &LHSPtr, &RHSPtr, CurFunc));
4474
4475 // Fix the CallBack code generated to use the correct Values for the LHS
4476 // and RHS
4477 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4478 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4479 ReductionFunc;
4480 });
4481 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4482 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4483 ReductionFunc;
4484 });
4485 } else {
 // Generic path: load, combine with ReductionGen, store back (unless the
 // variable is by-ref, in which case the store is inside the region).
4486 if (IsByRef.empty() || !IsByRef[En.index()]) {
4487 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4488 "red.value." + Twine(En.index()));
4489 }
4490 Value *PrivateRedValue = Builder.CreateLoad(
4491 ValueType, RHS, "red.private.value" + Twine(En.index()));
4492 Value *Reduced;
4493 InsertPointOrErrorTy AfterIP =
4494 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4495 if (!AfterIP)
4496 return AfterIP.takeError();
4497 Builder.restoreIP(*AfterIP);
4498
4499 if (!IsByRef.empty() && !IsByRef[En.index()])
4500 Builder.CreateStore(Reduced, RI.Variable);
4501 }
4502 }
4503 emitBlock(ExitBB, CurFunc);
4504 if (ContinuationBlock) {
4505 Builder.CreateBr(ContinuationBlock);
4506 Builder.SetInsertPoint(ContinuationBlock);
4507 }
4508 Config.setEmitLLVMUsed();
4509
4510 return Builder.saveIP();
4511}
4512
// Body of getFreshReductionFunc (its signature line is not part of this
// excerpt): declares a fresh void(ptr, ptr) function named
// ".omp.reduction.func" in module M; the body is filled in later by
// populateReductionFunction.
4514 Type *VoidTy = Type::getVoidTy(M.getContext());
4515 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4516 auto *FuncTy =
4517 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4519 ".omp.reduction.func", &M);
4520}
4521
// Fill in the body of an outlined reduction function: for each reduction
// variable, load the pair of element pointers from the two type-erased
// argument arrays, load the element values, combine them with
// RI.ReductionGen, and (for non-by-ref variables) store the result back to
// the LHS element. On GPU the argument pointers are first spilled and
// reloaded through allocas so address-space casts can be applied.
4523 Function *ReductionFunc,
4525 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4526 Module *Module = ReductionFunc->getParent();
4527 BasicBlock *ReductionFuncBlock =
4528 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4529 Builder.SetInsertPoint(ReductionFuncBlock);
4530 Value *LHSArrayPtr = nullptr;
4531 Value *RHSArrayPtr = nullptr;
4532 if (IsGPU) {
4533 // Need to alloca memory here and deal with the pointers before getting
4534 // LHS/RHS pointers out
4535 //
4536 Argument *Arg0 = ReductionFunc->getArg(0);
4537 Argument *Arg1 = ReductionFunc->getArg(1);
4538 Type *Arg0Type = Arg0->getType();
4539 Type *Arg1Type = Arg1->getType();
4540
4541 Value *LHSAlloca =
4542 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4543 Value *RHSAlloca =
4544 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4545 Value *LHSAddrCast =
4546 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4547 Value *RHSAddrCast =
4548 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4549 Builder.CreateStore(Arg0, LHSAddrCast);
4550 Builder.CreateStore(Arg1, RHSAddrCast);
4551 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4552 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4553 } else {
 // Host: the arguments already are the two pointer arrays.
4554 LHSArrayPtr = ReductionFunc->getArg(0);
4555 RHSArrayPtr = ReductionFunc->getArg(1);
4556 }
4557
4558 unsigned NumReductions = ReductionInfos.size();
4559 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4560
4561 for (auto En : enumerate(ReductionInfos)) {
4562 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4563 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4564 RedArrayTy, LHSArrayPtr, 0, En.index());
4565 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4566 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4567 LHSI8Ptr, RI.Variable->getType());
4568 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4569 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4570 RedArrayTy, RHSArrayPtr, 0, En.index());
4571 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4572 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4573 RHSI8Ptr, RI.PrivateVariable->getType());
4574 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4575 Value *Reduced;
4577 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4578 if (!AfterIP)
4579 return AfterIP.takeError();
4580
4581 Builder.restoreIP(*AfterIP);
4582 // TODO: Consider flagging an error.
4583 if (!Builder.GetInsertBlock())
4584 return Error::success();
4585
4586 // store is inside of the reduction region when using by-ref
4587 if (!IsByRef[En.index()])
4588 Builder.CreateStore(Reduced, LHSPtr);
4589 }
4590 Builder.CreateRetVoid();
4591 return Error::success();
4592}
4593
// Host lowering of OpenMP reductions (delegates to createReductionsGPU for
// device configs): builds the type-erased array of pointers to private
// values, calls __kmpc_reduce{_nowait}, and switches on its result to the
// non-atomic (case 1) or atomic (case 2) finalization path, ending with
// __kmpc_end_reduce{_nowait} and a branch to the continuation block.
4595 const LocationDescription &Loc, InsertPointTy AllocaIP,
4596 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4597 bool IsNoWait, bool IsTeamsReduction) {
4598 assert(ReductionInfos.size() == IsByRef.size());
4599 if (Config.isGPU())
4600 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4601 IsByRef, IsNoWait, IsTeamsReduction);
4602
4603 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4604
4605 if (!updateToLocation(Loc))
4606 return InsertPointTy();
4607
4608 if (ReductionInfos.size() == 0)
4609 return Builder.saveIP();
4610
 // Everything after the current point becomes the continuation.
4611 BasicBlock *InsertBlock = Loc.IP.getBlock();
4612 BasicBlock *ContinuationBlock =
4613 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4614 InsertBlock->getTerminator()->eraseFromParent();
4615
4616 // Create and populate array of type-erased pointers to private reduction
4617 // values.
4618 unsigned NumReductions = ReductionInfos.size();
4619 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4620 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4621 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4622
4623 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4624
4625 for (auto En : enumerate(ReductionInfos)) {
4626 unsigned Index = En.index();
4627 const ReductionInfo &RI = En.value();
4628 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4629 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4630 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4631 }
4632
4633 // Emit a call to the runtime function that orchestrates the reduction.
4634 // Declare the reduction function in the process.
4635 Type *IndexTy = Builder.getIndexTy(
4636 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4637 Function *Func = Builder.GetInsertBlock()->getParent();
4638 Module *Module = Func->getParent();
4639 uint32_t SrcLocStrSize;
4640 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 // The atomic path is only offered to the runtime if every variable has an
 // atomic reduction generator.
4641 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4642 return RI.AtomicReductionGen;
4643 });
4644 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4645 CanGenerateAtomic
4646 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4647 : IdentFlag(0));
4648 Value *ThreadId = getOrCreateThreadID(Ident);
4649 Constant *NumVariables = Builder.getInt32(NumReductions);
4650 const DataLayout &DL = Module->getDataLayout();
4651 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4652 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4653 Function *ReductionFunc = getFreshReductionFunc(*Module);
4654 Value *Lock = getOMPCriticalRegionLock(".reduction");
4656 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4657 : RuntimeFunction::OMPRTL___kmpc_reduce);
4658 CallInst *ReduceCall =
4659 createRuntimeFunctionCall(ReduceFunc,
4660 {Ident, ThreadId, NumVariables, RedArraySize,
4661 RedArray, ReductionFunc, Lock},
4662 "reduce");
4663
4664 // Create final reduction entry blocks for the atomic and non-atomic case.
4665 // Emit IR that dispatches control flow to one of the blocks based on the
4666 // reduction supporting the atomic mode.
4667 BasicBlock *NonAtomicRedBlock =
4668 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4669 BasicBlock *AtomicRedBlock =
4670 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4671 SwitchInst *Switch =
4672 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4673 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4674 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4675
4676 // Populate the non-atomic reduction using the elementwise reduction function.
4677 // This loads the elements from the global and private variables and reduces
4678 // them before storing back the result to the global variable.
4679 Builder.SetInsertPoint(NonAtomicRedBlock);
4680 for (auto En : enumerate(ReductionInfos)) {
4681 const ReductionInfo &RI = En.value();
4683 // We have one less load for by-ref case because that load is now inside of
4684 // the reduction region
4685 Value *RedValue = RI.Variable;
4686 if (!IsByRef[En.index()]) {
4687 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4688 "red.value." + Twine(En.index()));
4689 }
4690 Value *PrivateRedValue =
4691 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4692 "red.private.value." + Twine(En.index()));
4693 Value *Reduced;
4694 InsertPointOrErrorTy AfterIP =
4695 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4696 if (!AfterIP)
4697 return AfterIP.takeError();
4698 Builder.restoreIP(*AfterIP);
4699
4700 if (!Builder.GetInsertBlock())
4701 return InsertPointTy();
4702 // for by-ref case, the load is inside of the reduction region
4703 if (!IsByRef[En.index()])
4704 Builder.CreateStore(Reduced, RI.Variable);
4705 }
4706 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4707 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4708 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4709 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4710 Builder.CreateBr(ContinuationBlock);
4711
4712 // Populate the atomic reduction using the atomic elementwise reduction
4713 // function. There are no loads/stores here because they will be happening
4714 // inside the atomic elementwise reduction.
4715 Builder.SetInsertPoint(AtomicRedBlock);
4716 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4717 for (const ReductionInfo &RI : ReductionInfos) {
4719 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4720 if (!AfterIP)
4721 return AfterIP.takeError();
4722 Builder.restoreIP(*AfterIP);
4723 if (!Builder.GetInsertBlock())
4724 return InsertPointTy();
4725 }
4726 Builder.CreateBr(ContinuationBlock);
4727 } else {
 // The runtime never returns 2 when the ATOMIC_REDUCE flag was not set,
 // so this block is unreachable in that case.
4728 Builder.CreateUnreachable();
4729 }
4730
4731 // Populate the outlined reduction function using the elementwise reduction
4732 // function. Partial values are extracted from the type-erased array of
4733 // pointers to private variables.
4734 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4735 IsByRef, /*isGPU=*/false);
4736 if (Err)
4737 return Err;
4738
4739 if (!Builder.GetInsertBlock())
4740 return InsertPointTy();
4741
4742 Builder.SetInsertPoint(ContinuationBlock);
4743 return Builder.saveIP();
4744}
4745
// Emit an OpenMP `master` region: wrap the body generated by \p BodyGenCB
// between __kmpc_master / __kmpc_end_master calls, with the body executed
// conditionally (only when the entry call returns "I am the master").
4748 BodyGenCallbackTy BodyGenCB,
4749 FinalizeCallbackTy FiniCB) {
4750 if (!updateToLocation(Loc))
4751 return Loc.IP;
4752
4753 Directive OMPD = Directive::OMPD_master;
4754 uint32_t SrcLocStrSize;
4755 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4756 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4757 Value *ThreadId = getOrCreateThreadID(Ident);
4758 Value *Args[] = {Ident, ThreadId};
4759
4760 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4761 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4762
4763 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4764 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4765
4766 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4767 /*Conditional*/ true, /*hasFinalize*/ true);
4768}
4769
// Emit an OpenMP `masked` region: like `master`, but the entry call
// __kmpc_masked additionally takes \p Filter, selecting which thread(s)
// execute the body; __kmpc_end_masked takes only (ident, tid).
4772 BodyGenCallbackTy BodyGenCB,
4773 FinalizeCallbackTy FiniCB, Value *Filter) {
4774 if (!updateToLocation(Loc))
4775 return Loc.IP;
4776
4777 Directive OMPD = Directive::OMPD_masked;
4778 uint32_t SrcLocStrSize;
4779 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4780 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4781 Value *ThreadId = getOrCreateThreadID(Ident);
4782 Value *Args[] = {Ident, ThreadId, Filter};
4783 Value *ArgsEnd[] = {Ident, ThreadId};
4784
4785 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4786 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4787
4788 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4789 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4790
4791 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4792 /*Conditional*/ true, /*hasFinalize*/ true);
4793}
4794
// Emit a call to \p Callee with \p Args (no operand bundles) and mark the
// call site nothrow, so the optimizer need not keep unwind paths for it.
4796 llvm::FunctionCallee Callee,
4798 const llvm::Twine &Name) {
4799 llvm::CallInst *Call = Builder.CreateCall(
4800 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4801 Call->setDoesNotThrow();
4802 return Call;
4803}
4804
4805// Expects the input basic block to be dominated by BeforeScanBB.
4806// Once the scan directive is encountered, the code after the scan directive
4807// should be dominated by AfterScanBB. The scan directive splits the code
4808// sequence into an input phase and a scan phase. Based on whether the
4809// inclusive or exclusive clause is used in the scan directive, and on
4810// whether the input loop or the scan loop is being lowered, it adds jumps
4811// to the input and scan phases. The first scan loop is the input loop and
4812// the second is the scan loop. The code generated currently handles only
// inclusive scans.
// (See the block comment above for the overall scan lowering scheme.)
// On the first (input) loop this stores each private value into its scan
// buffer at the current IV; on the second (scan) loop it reloads the
// buffered value at the scan-phase entry. It then wires the dispatch block
// to the before/after-scan blocks depending on inclusivity and which loop
// is being lowered.
4814 const LocationDescription &Loc, InsertPointTy AllocaIP,
4815 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4816 bool IsInclusive, ScanInfo *ScanRedInfo) {
 // The scan buffers are allocated lazily the first time the input loop is
 // lowered.
4817 if (ScanRedInfo->OMPFirstScanLoop) {
4818 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4819 ScanVarsType, ScanRedInfo);
4820 if (Err)
4821 return Err;
4822 }
4823 if (!updateToLocation(Loc))
4824 return Loc.IP;
4825
4826 llvm::Value *IV = ScanRedInfo->IV;
4827
4828 if (ScanRedInfo->OMPFirstScanLoop) {
4829 // Emit buffer[i] = red; at the end of the input phase.
4830 for (size_t i = 0; i < ScanVars.size(); i++) {
4831 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4832 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4833 Type *DestTy = ScanVarsType[i];
4834 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4835 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4836
4837 Builder.CreateStore(Src, Val);
4838 }
4839 }
4840 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4841 emitBlock(ScanRedInfo->OMPScanDispatch,
4842 Builder.GetInsertBlock()->getParent());
4843
4844 if (!ScanRedInfo->OMPFirstScanLoop) {
4845 IV = ScanRedInfo->IV;
4846 // Emit red = buffer[i]; at the entrance to the scan phase.
4847 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4848 for (size_t i = 0; i < ScanVars.size(); i++) {
4849 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4850 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4851 Type *DestTy = ScanVarsType[i];
4852 Value *SrcPtr =
4853 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4854 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4855 Builder.CreateStore(Src, ScanVars[i]);
4856 }
4857 }
4858
4859 // TODO: Update it to CreateBr and remove dead blocks
 // Constant-true conditional branch; the taken/untaken order encodes
 // whether the input phase precedes or follows the scan point.
4860 llvm::Value *CmpI = Builder.getInt1(true);
4861 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4862 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4863 ScanRedInfo->OMPAfterScanBlock);
4864 } else {
4865 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4866 ScanRedInfo->OMPBeforeScanBlock);
4867 }
4868 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4869 Builder.GetInsertBlock()->getParent());
4870 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4871 return Builder.saveIP();
4872}
4873
/// Set up the temporary scan buffers: allocate one pointer slot per scan
/// variable at \p AllocaIP, then - inside a masked region executed by
/// thread 0 only - malloc a (Span + 1)-element buffer per variable and
/// store it into the slot, followed by a barrier so every thread observes
/// the initialized buffers.
4874Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4875 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4876 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4877
4878 Builder.restoreIP(AllocaIP);
4879 // Create the shared pointer at alloca IP.
4880 for (size_t i = 0; i < ScanVars.size(); i++) {
4881 llvm::Value *BuffPtr =
4882 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4883 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4884 }
4885
4886 // Allocate temporary buffer by master thread
4887 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4888 InsertPointTy CodeGenIP) -> Error {
4889 Builder.restoreIP(CodeGenIP);
 // One extra element beyond Span holds the final (total) value.
4890 Value *AllocSpan =
4891 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4892 for (size_t i = 0; i < ScanVars.size(); i++) {
4893 Type *IntPtrTy = Builder.getInt32Ty();
4894 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4895 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4896 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4897 AllocSpan, nullptr, "arr");
4898 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4899 }
4900 return Error::success();
4901 };
4902 // TODO: Perform finalization actions for variables. This has to be
4903 // called for variables which have destructors/finalizers.
4904 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4905
 // Filter 0 restricts the masked region to thread 0.
4906 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4907 llvm::Value *FilterVal = Builder.getInt32(0);
4909 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4910
4911 if (!AfterIP)
4912 return AfterIP.takeError();
4913 Builder.restoreIP(*AfterIP);
4914 BasicBlock *InputBB = Builder.GetInsertBlock();
4915 if (InputBB->getTerminator())
4916 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4917 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4918 if (!AfterIP)
4919 return AfterIP.takeError();
4920 Builder.restoreIP(*AfterIP);
4921
4922 return Error::success();
4923}
4924
/// Finalize a scan-based directive: inside a masked region executed by
/// thread 0 only, copy the buffer element at index Span (the accumulated
/// total) back into each original reduction variable and free the scan
/// buffers, then emit a barrier.
4925Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4926 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4927 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4928 InsertPointTy CodeGenIP) -> Error {
4929 Builder.restoreIP(CodeGenIP);
4930 for (ReductionInfo RedInfo : ReductionInfos) {
4931 Value *PrivateVar = RedInfo.PrivateVariable;
4932 Value *OrigVar = RedInfo.Variable;
4933 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4934 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4935
4936 Type *SrcTy = RedInfo.ElementType;
4937 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4938 "arrayOffset");
4939 Value *Src = Builder.CreateLoad(SrcTy, Val);
4940
4941 Builder.CreateStore(Src, OrigVar);
4942 Builder.CreateFree(Buff);
4943 }
4944 return Error::success();
4945 };
4946 // TODO: Perform finalization actions for variables. This has to be
4947 // called for variables which have destructors/finalizers.
4948 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4949
4950 if (ScanRedInfo->OMPScanFinish->getTerminator())
4951 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4952 else
4953 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4954
 // Filter 0 restricts the masked region to thread 0.
4955 llvm::Value *FilterVal = Builder.getInt32(0);
4957 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4958
4959 if (!AfterIP)
4960 return AfterIP.takeError();
4961 Builder.restoreIP(*AfterIP);
4962 BasicBlock *InputBB = Builder.GetInsertBlock();
4963 if (InputBB->getTerminator())
4964 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4965 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4966 if (!AfterIP)
4967 return AfterIP.takeError();
4968 Builder.restoreIP(*AfterIP);
4969 return Error::success();
4970}
4971
// NOTE(review): the scrape this chunk was recovered from dropped the lines
// carrying this function's return type/name (orig. 4972) and part of the
// parameter list (orig. 4974, presumably the ReductionInfos array referenced
// below). From the body: inside a masked region executed by a single thread
// (filter value 0 — presumably the primary thread; confirm against
// createMasked), this runs the up-sweep of a logarithmic inclusive scan over
// the per-iteration scan buffers: ceil(log2(Span)) outer passes, where pass k
// combines tmp[i] op= tmp[i - 2^k] for i from Span-1 down to 2^k. A barrier
// follows the masked region.
4973    const LocationDescription &Loc,
4975    ScanInfo *ScanRedInfo) {
4976
4977  if (!updateToLocation(Loc))
4978    return Loc.IP;
  // Body of the masked region: emit the log-scan sweep over the buffers.
4979  auto BodyGenCB = [&](InsertPointTy AllocaIP,
4980                       InsertPointTy CodeGenIP) -> Error {
4981    Builder.restoreIP(CodeGenIP);
4982    Function *CurFn = Builder.GetInsertBlock()->getParent();
4983    // for (int k = 0; k <= ceil(log2(n)); ++k)
4984    llvm::BasicBlock *LoopBB =
4985        BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4986    llvm::BasicBlock *ExitBB =
4987        splitBB(Builder, false, "omp.outer.log.scan.exit");
    // NOTE(review): a line was dropped here (orig. 4988) — presumably the
    // declaration of F as the llvm.log2 intrinsic obtained below.
4989        Builder.GetInsertBlock()->getModule(),
4990        (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4991    llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
    // Compute ceil(log2(Span)) as an i32: the number of outer sweep passes.
4992    llvm::Value *Arg =
4993        Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4994    llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
    // NOTE(review): a line was dropped here (orig. 4995) — presumably
    // reassigning F to the llvm.ceil intrinsic used next.
4996        Builder.GetInsertBlock()->getModule(),
4997        (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4998    LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4999    LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5000    llvm::Value *NMin1 = Builder.CreateNUWSub(
5001        ScanRedInfo->Span,
5002        llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5003    Builder.SetInsertPoint(InputBB);
5004    Builder.CreateBr(LoopBB);
5005    emitBlock(LoopBB, CurFn);
5006    Builder.SetInsertPoint(LoopBB);
5007
    // Outer loop PHIs: pass counter k and pow2k = 2^k.
5008    PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5009    // size pow2k = 1;
5010    PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5011    Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5012                         InputBB);
5013    Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5014                       InputBB);
5015    // for (size i = n - 1; i >= 2 ^ k; --i)
5016    //   tmp[i] op= tmp[i-pow2k];
5017    llvm::BasicBlock *InnerLoopBB =
5018        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5019    llvm::BasicBlock *InnerExitBB =
5020        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
    // Skip the inner loop entirely when n-1 < pow2k.
5021    llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5022    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5023    emitBlock(InnerLoopBB, CurFn);
5024    Builder.SetInsertPoint(InnerLoopBB);
5025    PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5026    IVal->addIncoming(NMin1, LoopBB);
    // Combine buffer element IV with element IV-pow2k for every reduction.
5027    for (ReductionInfo RedInfo : ReductionInfos) {
5028      Value *ReductionVal = RedInfo.PrivateVariable;
5029      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5030      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5031      Type *DestTy = RedInfo.ElementType;
5032      Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5033      Value *LHSPtr =
5034          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5035      Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5036      Value *RHSPtr =
5037          Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5038      Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5039      Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5040      llvm::Value *Result;
      // ReductionGen emits the user-provided combiner; it may fail.
5041      InsertPointOrErrorTy AfterIP =
5042          RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5043      if (!AfterIP)
5044        return AfterIP.takeError();
5045      Builder.CreateStore(Result, LHSPtr);
5046    }
    // Inner loop latch: --i, continue while i >= pow2k.
5047    llvm::Value *NextIVal = Builder.CreateNUWSub(
5048        IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5049    IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5050    CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5051    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5052    emitBlock(InnerExitBB, CurFn);
    // Outer loop latch: ++k, pow2k <<= 1, continue while k != ceil(log2(n)).
5053    llvm::Value *Next = Builder.CreateNUWAdd(
5054        Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5055    Counter->addIncoming(Next, Builder.GetInsertBlock());
5056    // pow2k <<= 1;
5057    llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5058    Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5059    llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5060    Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5061    Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5062    return Error::success();
5063  };
5064
5065  // TODO: Perform finalization actions for variables. This has to be
5066  // called for variables which have destructors/finalizers.
5067  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5068
  // Run the sweep inside a masked region (filter 0) so only one thread
  // mutates the shared scan buffers; everyone then meets at a barrier.
5069  llvm::Value *FilterVal = Builder.getInt32(0);
  // NOTE(review): a line was dropped here (orig. 5070) — presumably the
  // declaration of AfterIP receiving createMasked's result.
5071      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5072
5073  if (!AfterIP)
5074    return AfterIP.takeError();
5075  Builder.restoreIP(*AfterIP);
5076  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5077
5078  if (!AfterIP)
5079    return AfterIP.takeError();
5080  Builder.restoreIP(*AfterIP);
  // Copy the final reduction results back into the original variables and
  // free the temporary buffers.
5081  Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5082  if (Err)
5083    return Err;
5084
5085  return AfterIP;
5086}
5087
5088Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5089 llvm::function_ref<Error()> InputLoopGen,
5090 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5091 ScanInfo *ScanRedInfo) {
5092
5093 {
5094 // Emit loop with input phase:
5095 // for (i: 0..<num_iters>) {
5096 // <input phase>;
5097 // buffer[i] = red;
5098 // }
5099 ScanRedInfo->OMPFirstScanLoop = true;
5100 Error Err = InputLoopGen();
5101 if (Err)
5102 return Err;
5103 }
5104 {
5105 // Emit loop with scan phase:
5106 // for (i: 0..<num_iters>) {
5107 // red = buffer[i];
5108 // <scan phase>;
5109 // }
5110 ScanRedInfo->OMPFirstScanLoop = false;
5111 Error Err = ScanLoopGen(Builder.saveIP());
5112 if (Err)
5113 return Err;
5114 }
5115 return Error::success();
5116}
5117
5118void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5119 Function *Fun = Builder.GetInsertBlock()->getParent();
5120 ScanRedInfo->OMPScanDispatch =
5121 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5122 ScanRedInfo->OMPAfterScanBlock =
5123 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5124 ScanRedInfo->OMPBeforeScanBlock =
5125 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5126 ScanRedInfo->OMPScanLoopExit =
5127 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5128}
// NOTE(review): the scrape dropped the line with this function's return type
// and name (orig. 5129) — from the body and the LoopInfos bookkeeping it is
// presumably CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(...).
// It materializes the canonical loop CFG
//   Preheader -> Header -> Cond -> Body -> Latch -> (back to Header) / Exit -> After
// iterating an IV from 0 (exclusive of TripCount) with step 1, and registers
// the result in the builder-owned LoopInfos list.
5130    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5131    BasicBlock *PostInsertBefore, const Twine &Name) {
5132  Module *M = F->getParent();
5133  LLVMContext &Ctx = M->getContext();
  // The induction variable uses the same integer type as the trip count.
5134  Type *IndVarTy = TripCount->getType();
5135
5136  // Create the basic block structure.
5137  BasicBlock *Preheader =
5138      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5139  BasicBlock *Header =
5140      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5141  BasicBlock *Cond =
5142      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5143  BasicBlock *Body =
5144      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5145  BasicBlock *Latch =
5146      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5147  BasicBlock *Exit =
5148      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5149  BasicBlock *After =
5150      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5151
5152  // Use specified DebugLoc for new instructions.
5153  Builder.SetCurrentDebugLocation(DL);
5154
5155  Builder.SetInsertPoint(Preheader);
5156  Builder.CreateBr(Header);
5157
  // Header: IV phi starting at 0; second incoming edge added at the latch.
5158  Builder.SetInsertPoint(Header);
5159  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5160  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5161  Builder.CreateBr(Cond);
5162
  // Cond: continue while IV < TripCount (unsigned compare by convention).
5163  Builder.SetInsertPoint(Cond);
5164  Value *Cmp =
5165      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5166  Builder.CreateCondBr(Cmp, Body, Exit);
5167
5168  Builder.SetInsertPoint(Body);
5169  Builder.CreateBr(Latch);
5170
  // Latch: IV += 1 (nuw — IV never wraps past TripCount) and loop back.
5171  Builder.SetInsertPoint(Latch);
5172  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5173                                  "omp_" + Name + ".next", /*HasNUW=*/true);
5174  Builder.CreateBr(Header);
5175  IndVarPHI->addIncoming(Next, Latch);
5176
5177  Builder.SetInsertPoint(Exit);
5178  Builder.CreateBr(After);
5179
5180  // Remember and return the canonical control flow.
5181  LoopInfos.emplace_front();
5182  CanonicalLoopInfo *CL = &LoopInfos.front();
5183
5184  CL->Header = Header;
5185  CL->Cond = Cond;
5186  CL->Latch = Latch;
5187  CL->Exit = Exit;
5188
5189#ifndef NDEBUG
5190  CL->assertOK();
5191#endif
5192  return CL;
5193}
5194
// NOTE(review): the scrape dropped this function's head (orig. 5195-5196) —
// presumably Expected<CanonicalLoopInfo *>
// OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, ...).
// Builds a canonical loop skeleton for a known trip count, connects it to the
// CFG at Loc, then runs the body-generation callback inside the loop body.
5197                                     LoopBodyGenCallbackTy BodyGenCB,
5198                                     Value *TripCount, const Twine &Name) {
5199  BasicBlock *BB = Loc.IP.getBlock();
5200  BasicBlock *NextBB = BB->getNextNode();
5201
5202  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5203                                             NextBB, NextBB, Name);
5204  BasicBlock *After = CL->getAfter();
5205
5206  // If location is not set, don't connect the loop.
5207  if (updateToLocation(Loc)) {
5208    // Split the loop at the insertion point: Branch to the preheader and move
5209    // every following instruction to after the loop (the After BB). Also, the
5210    // new successor is the loop's after block.
5211    spliceBB(Builder, After, /*CreateBranch=*/false);
5212    Builder.CreateBr(CL->getPreheader());
5213  }
5214
5215  // Emit the body content. We do it after connecting the loop to the CFG to
5216  // avoid that the callback encounters degenerate BBs.
5217  if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5218    return Err;
5219
5220#ifndef NDEBUG
5221  CL->assertOK();
5222#endif
5223  return CL;
5224}
5225
// NOTE(review): the line carrying this function's signature (orig. 5226) was
// dropped by the scrape. From the body: default-constructs a ScanInfo at the
// front of the builder-owned ScanInfos list (so its lifetime is tied to the
// OpenMPIRBuilder) and hands back a pointer to it.
5227  ScanInfos.emplace_front();
5228  ScanInfo *Result = &ScanInfos.front();
5229  return Result;
5230}
5231
// NOTE(review): the scrape dropped this function's head (orig. 5232-5234) —
// presumably the createCanonicalScanLoops entry point returning the list of
// generated CanonicalLoopInfo objects (Result below). It lowers a scan
// directive by emitting the SAME user loop twice (input pass then scan pass,
// see emitScanBasedDirectiveIR) with the scan dispatch/before/after/exit
// blocks spliced into each body.
5235    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5236    InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5237  LocationDescription ComputeLoc =
5238      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5239  updateToLocation(ComputeLoc);
5240
  // NOTE(review): lines were dropped here (orig. 5241/5243) — presumably the
  // declaration of Result and of the TripCount computed by
  // calculateCanonicalLoopTripCount(...) below.
5242
5244      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
  // Span is the iteration count; the scan buffers are sized from it.
5245  ScanRedInfo->Span = TripCount;
5246  ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5247  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5248
  // Shared body generator for both passes: reroute the loop body through
  // dispatch -> before-scan -> loop-exit, then emit the user body starting in
  // the before-scan block.
5249  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5250    Builder.restoreIP(CodeGenIP);
5251    ScanRedInfo->IV = IV;
5252    createScanBBs(ScanRedInfo);
5253    BasicBlock *InputBlock = Builder.GetInsertBlock();
5254    Instruction *Terminator = InputBlock->getTerminator();
5255    assert(Terminator->getNumSuccessors() == 1);
5256    BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5257    Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5258    emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5259              Builder.GetInsertBlock()->getParent());
5260    Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5261    emitBlock(ScanRedInfo->OMPScanLoopExit,
5262              Builder.GetInsertBlock()->getParent());
5263    Builder.CreateBr(ContinueBlock);
5264    Builder.SetInsertPoint(
5265        ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5266    return BodyGenCB(Builder.saveIP(), IV);
5267  };
5268
  // First pass: the input-phase loop.
5269  const auto &&InputLoopGen = [&]() -> Error {
    // NOTE(review): a line was dropped here (orig. 5270) — presumably the
    // declaration of LoopInfo receiving createCanonicalLoop's result.
5271        Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5272        ComputeIP, Name, true, ScanRedInfo);
5273    if (!LoopInfo)
5274      return LoopInfo.takeError();
5275    Result.push_back(*LoopInfo);
5276    Builder.restoreIP((*LoopInfo)->getAfterIP());
5277    return Error::success();
5278  };
  // Second pass: the scan-phase loop; also records where the whole construct
  // finishes (OMPScanFinish) for the finalization code.
5279  const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
    // NOTE(review): a line was dropped here (orig. 5280) — presumably the
    // declaration of LoopInfo assigned from createCanonicalLoop below.
5281        createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5282                            InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5283    if (!LoopInfo)
5284      return LoopInfo.takeError();
5285    Result.push_back(*LoopInfo);
5286    Builder.restoreIP((*LoopInfo)->getAfterIP());
5287    ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5288    return Error::success();
5289  };
5290  Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5291  if (Err)
5292    return Err;
5293  return Result;
5294}
5295
// NOTE(review): the scrape dropped this function's head (orig. 5296) —
// presumably Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(...).
// Computes the trip count of a Start/Stop/Step loop as an unsigned value of
// the induction-variable type, carefully avoiding overflow at the extremes.
5297    const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5298    bool IsSigned, bool InclusiveStop, const Twine &Name) {
5299
5300  // Consider the following difficulties (assuming 8-bit signed integers):
5301  //  * Adding \p Step to the loop counter which passes \p Stop may overflow:
5302  //      DO I = 1, 100, 50
5303  //  * A \p Step of INT_MIN cannot be normalized to a positive direction:
5304  //      DO I = 100, 0, -128
5305
5306  // Start, Stop and Step must be of the same integer type.
5307  auto *IndVarTy = cast<IntegerType>(Start->getType());
5308  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5309  assert(IndVarTy == Step->getType() && "Step type mismatch");
5310
  // NOTE(review): a line was dropped here (orig. 5311) — presumably an
  // updateToLocation(Loc) call positioning the builder.
5312
5313  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5314  ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5315
5316  // Like Step, but always positive.
5317  Value *Incr = Step;
5318
5319  // Distance between Start and Stop; always positive.
5320  Value *Span;
5321
5322  // Condition whether no iterations are executed at all, e.g. because
5323  // UB < LB.
5324  Value *ZeroCmp;
5325
5326  if (IsSigned) {
5327    // Ensure that increment is positive. If not, negate and invert LB and UB.
5328    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5329    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5330    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5331    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    // UB - LB cannot wrap for signed inputs once LB <= UB (nsw set below).
5332    Span = Builder.CreateSub(UB, LB, "", false, true);
5333    ZeroCmp = Builder.CreateICmp(
5334        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5335  } else {
5336    Span = Builder.CreateSub(Stop, Start, "", true);
5337    ZeroCmp = Builder.CreateICmp(
5338        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5339  }
5340
5341  Value *CountIfLooping;
5342  if (InclusiveStop) {
5343    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5344  } else {
5345    // Avoid incrementing past stop since it could overflow.
5346    Value *CountIfTwo = Builder.CreateAdd(
5347        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5348    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5349    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5350  }
5351
  // Zero-iteration loops select 0; otherwise the count computed above.
5352  return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5353                              "omp_" + Name + ".tripcount");
5354}
5355
// NOTE(review): the scrape dropped this overload's head (orig. 5356-5357) —
// presumably the createCanonicalLoop overload taking Start/Stop/Step. It
// computes the trip count, then delegates to the trip-count overload with a
// body wrapper that maps the canonical IV (0..tripcount) back to the user's
// induction value: IndVar = Start + IV * Step.
5358    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5359    InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5360    ScanInfo *ScanRedInfo) {
5361  LocationDescription ComputeLoc =
5362      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5363
  // NOTE(review): a line was dropped here (orig. 5364) — presumably
  // "Value *TripCount = calculateCanonicalLoopTripCount(".
5365      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5366
5367  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5368    Builder.restoreIP(CodeGenIP);
    // Translate the normalized IV into the user-visible induction value.
5369    Value *Span = Builder.CreateMul(IV, Step);
5370    Value *IndVar = Builder.CreateAdd(Span, Start);
    // Scan lowering needs the user-level IV to index its buffers.
5371    if (InScan)
5372      ScanRedInfo->IV = IndVar;
5373    return BodyGenCB(Builder.saveIP(), IndVar);
5374  };
5375  LocationDescription LoopLoc =
5376      ComputeIP.isSet()
5377          ? Loc
5378          : LocationDescription(Builder.saveIP(),
5379                                Builder.getCurrentDebugLocation());
5380  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5381}
5382
5383// Returns an LLVM function to call for initializing loop bounds using OpenMP
5384// static scheduling for composite `distribute parallel for` depending on
5385// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5386// integers as unsigned similarly to CanonicalLoopInfo.
// NOTE(review): a line was dropped by the scrape between the next two lines
// (orig. 5388) — presumably "getKmpcDistForStaticInitForType(Type *Ty,
// Module &M," completing this signature.
5387static FunctionCallee
5389                               OpenMPIRBuilder &OMPBuilder) {
  // Dispatch on the induction-variable bit width; the runtime only provides
  // unsigned 32- and 64-bit entry points.
5390  unsigned Bitwidth = Ty->getIntegerBitWidth();
5391  if (Bitwidth == 32)
5392    return OMPBuilder.getOrCreateRuntimeFunction(
5393        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5394  if (Bitwidth == 64)
5395    return OMPBuilder.getOrCreateRuntimeFunction(
5396        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5397  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5398}
5399
5400// Returns an LLVM function to call for initializing loop bounds using OpenMP
5401// static scheduling depending on `type`. Only i32 and i64 are supported by the
5402// runtime. Always interpret integers as unsigned similarly to
5403// CanonicalLoopInfo.
// NOTE(review): the scrape dropped the line carrying this signature's head
// (orig. 5404) — presumably "static FunctionCallee
// getKmpcForStaticInitForType(Type *Ty, Module &M,".
5405                                              OpenMPIRBuilder &OMPBuilder) {
  // Only unsigned 32- and 64-bit variants exist in the runtime.
5406  unsigned Bitwidth = Ty->getIntegerBitWidth();
5407  if (Bitwidth == 32)
5408    return OMPBuilder.getOrCreateRuntimeFunction(
5409        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5410  if (Bitwidth == 64)
5411    return OMPBuilder.getOrCreateRuntimeFunction(
5412        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5413  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5414}
5415
// Lower a canonical loop as an OpenMP statically-scheduled worksharing loop:
// call __kmpc_(dist_)for_static_init in the preheader to obtain this thread's
// chunk bounds, rebase the induction variable onto the chunk's lower bound,
// call __kmpc_for_static_fini in the exit block, and optionally emit a
// trailing barrier. The CanonicalLoopInfo is invalidated afterwards.
5416OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5417    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5418    WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5419    OMPScheduleType DistScheduleSchedType) {
5420  assert(CLI->isValid() && "Requires a valid canonical loop");
5421  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5422         "Require dedicated allocate IP");
5423
5424  // Set up the source location value for OpenMP runtime.
5425  Builder.restoreIP(CLI->getPreheaderIP());
5426  Builder.SetCurrentDebugLocation(DL);
5427
5428  uint32_t SrcLocStrSize;
5429  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5430  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5431
5432  // Declare useful OpenMP runtime functions.
5433  Value *IV = CLI->getIndVar();
5434  Type *IVTy = IV->getType();
  // dist_for_static_init takes an extra "distribute upper bound" pointer,
  // hence the distinct runtime entry point.
5435  FunctionCallee StaticInit =
5436      LoopType == WorksharingLoopType::DistributeForStaticLoop
5437          ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5438          : getKmpcForStaticInitForType(IVTy, M, *this);
5439  FunctionCallee StaticFini =
5440      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5441
5442  // Allocate space for computed loop bounds as expected by the "init" function.
5443  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5444
5445  Type *I32Type = Type::getInt32Ty(M.getContext());
5446  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5447  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5448  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5449  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5450  CLI->setLastIter(PLastIter);
5451
5452  // At the end of the preheader, prepare for calling the "init" function by
5453  // storing the current loop bounds into the allocated space. A canonical loop
5454  // always iterates from 0 to trip-count with step 1. Note that "init" expects
5455  // and produces an inclusive upper bound.
5456  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5457  Constant *Zero = ConstantInt::get(IVTy, 0);
5458  Constant *One = ConstantInt::get(IVTy, 1);
5459  Builder.CreateStore(Zero, PLowerBound);
5460  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5461  Builder.CreateStore(UpperBound, PUpperBound);
5462  Builder.CreateStore(One, PStride);
5463
5464  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5465
  // NOTE(review): a line was dropped by the scrape after the "?" arm (orig.
  // 5469) — presumably the ":" arm selecting the non-distribute schedule.
5466  OMPScheduleType SchedType =
5467      (LoopType == WorksharingLoopType::DistributeStaticLoop)
5468          ? OMPScheduleType::OrderedDistribute
5470  Constant *SchedulingType =
5471      ConstantInt::get(I32Type, static_cast<int>(SchedType));
5472
5473  // Call the "init" function and update the trip count of the loop with the
5474  // value it produced.
5475  auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5476                        PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5477                        this](Value *SchedulingType, auto &Builder) {
5478    SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5479                                   PLowerBound, PUpperBound});
    // The dist_for variant consumes an extra distribute-upper-bound slot.
5480    if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5481      Value *PDistUpperBound =
5482          Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5483      Args.push_back(PDistUpperBound);
5484    }
5485    Args.append({PStride, One, Zero});
5486    createRuntimeFunctionCall(StaticInit, Args);
5487  };
5488  BuildInitCall(SchedulingType, Builder);
5489  if (HasDistSchedule &&
5490      LoopType != WorksharingLoopType::DistributeStaticLoop) {
5491    Constant *DistScheduleSchedType = ConstantInt::get(
5492        I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5493    // We want to emit a second init function call for the dist_schedule clause
5494    // to the Distribute construct. This should only be done however if a
5495    // Workshare Loop is nested within a Distribute Construct
5496    BuildInitCall(DistScheduleSchedType, Builder);
5497  }
  // The runtime wrote this thread's inclusive bounds; shrink the canonical
  // loop's trip count to the size of the assigned chunk.
5498  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5499  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5500  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5501  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5502  CLI->setTripCount(TripCount);
5503
5504  // Update all uses of the induction variable except the one in the condition
5505  // block that compares it with the actual upper bound, and the increment in
5506  // the latch block.
5507
5508  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5509    Builder.SetInsertPoint(CLI->getBody(),
5510                           CLI->getBody()->getFirstInsertionPt());
5511    Builder.SetCurrentDebugLocation(DL);
5512    return Builder.CreateAdd(OldIV, LowerBound);
5513  });
5514
5515  // In the "exit" block, call the "fini" function.
5516  Builder.SetInsertPoint(CLI->getExit(),
5517                         CLI->getExit()->getTerminator()->getIterator());
5518  createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5519
5520  // Add the barrier if requested.
5521  if (NeedsBarrier) {
    // NOTE(review): a line was dropped here (orig. 5523) — presumably the
    // createBarrier(...) call whose arguments continue below.
5522    InsertPointOrErrorTy BarrierIP =
5524                      omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5525                      /* CheckCancelFlag */ false);
5526    if (!BarrierIP)
5527      return BarrierIP.takeError();
5528  }
5529
  // The loop no longer satisfies the canonical-form invariants; invalidate it
  // so later transformations cannot be applied on top.
5530  InsertPointTy AfterIP = CLI->getAfterIP();
5531  CLI->invalidate();
5532
5533  return AfterIP;
5534}
5535
5536static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5537 LoopInfo &LI);
5538static void addLoopMetadata(CanonicalLoopInfo *Loop,
5539 ArrayRef<Metadata *> Properties);
5540
// NOTE(review): the scrape dropped this function's head and one parameter
// line (orig. 5541/5543) — from the call site at applyStaticChunkedWorkshare-
// Loop it is presumably static void applyParallelAccessesMetadata(
// CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo,
// SmallVector<Metadata *> &LoopMDList). It tags the loop body's memory
// accesses with a fresh access group and appends the matching
// llvm.loop.parallel_accesses entry to the loop-metadata list.
5542                                          LLVMContext &Ctx, Loop *Loop,
5544                                          SmallVector<Metadata *> &LoopMDList) {
5545  SmallSet<BasicBlock *, 8> Reachable;
5546
5547  // Get the basic blocks from the loop in which memref instructions
5548  // can be found.
5549  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5550  // preferably without running any passes.
5551  for (BasicBlock *Block : Loop->getBlocks()) {
    // Skip the control blocks; only body blocks carry user memory accesses.
5552    if (Block == CLI->getCond() || Block == CLI->getHeader())
5553      continue;
5554    Reachable.insert(Block);
5555  }
5556
5557  // Add access group metadata to memory-access instructions.
5558  MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5559  for (BasicBlock *BB : Reachable)
5560    addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5561  // TODO: If the loop has existing parallel access metadata, have
5562  // to combine two lists.
5563  LoopMDList.push_back(MDNode::get(
5564      Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5565}
5566
// NOTE(review): the scrape dropped the return-type line (orig. 5567) —
// presumably OpenMPIRBuilder::InsertPointOrErrorTy.
// Lower a canonical loop with static CHUNKED scheduling: a new outer
// "dispatch" loop enumerates this thread's chunks (start/stride from
// __kmpc_for_static_init), and the original loop is rewired to iterate the
// current chunk. Finishes with __kmpc_for_static_fini and an optional
// barrier.
5568OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5569    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5570    bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5571    Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5572  assert(CLI->isValid() && "Requires a valid canonical loop");
5573  assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5574
5575  LLVMContext &Ctx = CLI->getFunction()->getContext();
5576  Value *IV = CLI->getIndVar();
5577  Value *OrigTripCount = CLI->getTripCount();
5578  Type *IVTy = IV->getType();
5579  assert(IVTy->getIntegerBitWidth() <= 64 &&
5580         "Max supported tripcount bitwidth is 64 bits");
  // The runtime only has 32/64-bit entry points; widen narrower IVs.
5581  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5582                                                        : Type::getInt64Ty(Ctx);
5583  Type *I32Type = Type::getInt32Ty(M.getContext());
5584  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5585  Constant *One = ConstantInt::get(InternalIVTy, 1);
5586
  // Run LoopAnalysis on the function so the chunked loop's body blocks can be
  // tagged as parallel accesses.
  // NOTE(review): a line was dropped here (orig. 5588) — presumably the
  // FunctionAnalysisManager FAM declaration registered into below.
5587  Function *F = CLI->getFunction();
5589  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5590  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5591  LoopAnalysis LIA;
5592  LoopInfo &&LI = LIA.run(*F, FAM);
5593  Loop *L = LI.getLoopFor(CLI->getHeader());
5594  SmallVector<Metadata *> LoopMDList;
5595  if (ChunkSize || DistScheduleChunkSize)
5596    applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5597  addLoopMetadata(CLI, LoopMDList);
5598
5599  // Declare useful OpenMP runtime functions.
5600  FunctionCallee StaticInit =
5601      getKmpcForStaticInitForType(InternalIVTy, M, *this);
5602  FunctionCallee StaticFini =
5603      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5604
5605  // Allocate space for computed loop bounds as expected by the "init" function.
5606  Builder.restoreIP(AllocaIP);
5607  Builder.SetCurrentDebugLocation(DL);
5608  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5609  Value *PLowerBound =
5610      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5611  Value *PUpperBound =
5612      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5613  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5614  CLI->setLastIter(PLastIter);
5615
5616  // Set up the source location value for the OpenMP runtime.
5617  Builder.restoreIP(CLI->getPreheaderIP());
5618  Builder.SetCurrentDebugLocation(DL);
5619
5620  // TODO: Detect overflow in ubsan or max-out with current tripcount.
5621  Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5622      ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5623  Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5624      DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5625      "distschedulechunksize");
5626  Value *CastedTripCount =
5627      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5628
5629  Constant *SchedulingType =
5630      ConstantInt::get(I32Type, static_cast<int>(SchedType));
5631  Constant *DistSchedulingType =
5632      ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5633  Builder.CreateStore(Zero, PLowerBound);
5634  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  // "init" expects an inclusive upper bound; clamp the degenerate 0-trip case
  // to 0 instead of underflowing to the unsigned max.
5635  Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5636  Value *UpperBound =
5637      Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5638  Builder.CreateStore(UpperBound, PUpperBound);
5639  Builder.CreateStore(One, PStride);
5640
5641  // Call the "init" function and update the trip count of the loop with the
5642  // value it produced.
5643  uint32_t SrcLocStrSize;
5644  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5645  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5646  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5647  auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5648                        PUpperBound, PStride, One,
5649                        this](Value *SchedulingType, Value *ChunkSize,
5650                              auto &Builder) {
    // NOTE(review): a line was dropped here (orig. 5651) — presumably the
    // createRuntimeFunctionCall( opener for the argument list below.
5652        StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5653                     /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5654                     /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5655                     /*pstride=*/PStride, /*incr=*/One,
5656                     /*chunk=*/ChunkSize});
5657  };
5658  BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5659  if (DistScheduleSchedType != OMPScheduleType::None &&
5660      SchedType != OMPScheduleType::OrderedDistributeChunked &&
5661      SchedType != OMPScheduleType::OrderedDistribute) {
5662    // We want to emit a second init function call for the dist_schedule clause
5663    // to the Distribute construct. This should only be done however if a
5664    // Workshare Loop is nested within a Distribute Construct
5665    BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5666  }
5667
5668  // Load values written by the "init" function.
5669  Value *FirstChunkStart =
5670      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5671  Value *FirstChunkStop =
5672      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5673  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5674  Value *ChunkRange =
5675      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5676  Value *NextChunkStride =
5677      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5678
5679  // Create outer "dispatch" loop for enumerating the chunks.
5680  BasicBlock *DispatchEnter = splitBB(Builder, true);
5681  Value *DispatchCounter;
5682
5683  // It is safe to assume this didn't return an error because the callback
5684  // passed into createCanonicalLoop is the only possible error source, and it
5685  // always returns success.
5686  CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5687      {Builder.saveIP(), DL},
5688      [&](InsertPointTy BodyIP, Value *Counter) {
5689        DispatchCounter = Counter;
5690        return Error::success();
5691      },
5692      FirstChunkStart, CastedTripCount, NextChunkStride,
5693      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5694      "dispatch"));
5695
5696  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5697  // not have to preserve the canonical invariant.
5698  BasicBlock *DispatchBody = DispatchCLI->getBody();
5699  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5700  BasicBlock *DispatchExit = DispatchCLI->getExit();
5701  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5702  DispatchCLI->invalidate();
5703
5704  // Rewire the original loop to become the chunk loop inside the dispatch loop.
5705  redirectTo(DispatchAfter, CLI->getAfter(), DL);
5706  redirectTo(CLI->getExit(), DispatchLatch, DL);
5707  redirectTo(DispatchBody, DispatchEnter, DL);
5708
5709  // Prepare the prolog of the chunk loop.
5710  Builder.restoreIP(CLI->getPreheaderIP());
5711  Builder.SetCurrentDebugLocation(DL);
5712
5713  // Compute the number of iterations of the chunk loop.
5714  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5715  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5716  Value *IsLastChunk =
5717      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5718  Value *CountUntilOrigTripCount =
5719      Builder.CreateSub(CastedTripCount, DispatchCounter);
  // The final chunk may be shorter than ChunkRange; clamp to the remainder.
5720  Value *ChunkTripCount = Builder.CreateSelect(
5721      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5722  Value *BackcastedChunkTC =
5723      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5724  CLI->setTripCount(BackcastedChunkTC);
5725
5726  // Update all uses of the induction variable except the one in the condition
5727  // block that compares it with the actual upper bound, and the increment in
5728  // the latch block.
5729  Value *BackcastedDispatchCounter =
5730      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5731  CLI->mapIndVar([&](Instruction *) -> Value * {
5732    Builder.restoreIP(CLI->getBodyIP());
5733    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5734  });
5735
5736  // In the "exit" block, call the "fini" function.
5737  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5738  createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5739
5740  // Add the barrier if requested.
5741  if (NeedsBarrier) {
5742    InsertPointOrErrorTy AfterIP =
5743        createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5744                      /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5745    if (!AfterIP)
5746      return AfterIP.takeError();
5747  }
5748
5749#ifndef NDEBUG
5750  // Even though we currently do not support applying additional methods to it,
5751  // the chunk loop should remain a canonical loop.
5752  CLI->assertOK();
5753#endif
5754
5755  return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5756}
5757
5758// Returns an LLVM function to call for executing an OpenMP static worksharing
5759// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5760// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
// NOTE(review): a line was dropped by the scrape between the next two lines
// (orig. 5762) — presumably "getKmpcForStaticLoopForType(Type *Ty,
// OpenMPIRBuilder *OMPBuilder," completing this signature. Selects the
// device-RTL outlined-loop entry point for the worksharing kind and the
// (unsigned 32/64-bit) trip-count type.
5761static FunctionCallee
5763                            WorksharingLoopType LoopType) {
5764  unsigned Bitwidth = Ty->getIntegerBitWidth();
5765  Module &M = OMPBuilder->M;
5766  switch (LoopType) {
5767  case WorksharingLoopType::ForStaticLoop:
5768    if (Bitwidth == 32)
5769      return OMPBuilder->getOrCreateRuntimeFunction(
5770          M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5771    if (Bitwidth == 64)
5772      return OMPBuilder->getOrCreateRuntimeFunction(
5773          M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5774    break;
5775  case WorksharingLoopType::DistributeStaticLoop:
5776    if (Bitwidth == 32)
5777      return OMPBuilder->getOrCreateRuntimeFunction(
5778          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5779    if (Bitwidth == 64)
5780      return OMPBuilder->getOrCreateRuntimeFunction(
5781          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5782    break;
5783  case WorksharingLoopType::DistributeForStaticLoop:
5784    if (Bitwidth == 32)
5785      return OMPBuilder->getOrCreateRuntimeFunction(
5786          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5787    if (Bitwidth == 64)
5788      return OMPBuilder->getOrCreateRuntimeFunction(
5789          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5790    break;
5791  }
  // Falling out of the switch means an unsupported bitwidth or enum value.
5792  if (Bitwidth != 32 && Bitwidth != 64) {
5793    llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5794  }
5795  llvm_unreachable("Unknown type of OpenMP worksharing loop");
5796}
5797
5798 // Inserts a call to proper OpenMP Device RTL function which handles
5799 // loop worksharing.
// NOTE(review): original line 5800 (function name + first parameter) was
// dropped by the doc extraction. The call site in workshareLoopTargetCallback
// passes (OMPIRBuilder, LoopType, Preheader, Ident, LoopBodyArg, TripCount,
// OutlinedFn, NoLoop), so the missing line presumably read
// "static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder," -- TODO confirm.
5801                                           WorksharingLoopType LoopType,
5802                                           BasicBlock *InsertBlock, Value *Ident,
5803                                           Value *LoopBodyArg, Value *TripCount,
5804                                           Function &LoopBodyFn, bool NoLoop) {
5805   Type *TripCountTy = TripCount->getType();
5806   Module &M = OMPBuilder->M;
5807   IRBuilder<> &Builder = OMPBuilder->Builder;
5808   FunctionCallee RTLFn =
5809       getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
  // Common leading arguments shared by all three entry points:
  // (ident, outlined body fn, body arg struct, trip count).
5810   SmallVector<Value *, 8> RealArgs;
5811   RealArgs.push_back(Ident);
5812   RealArgs.push_back(&LoopBodyFn);
5813   RealArgs.push_back(LoopBodyArg);
5814   RealArgs.push_back(TripCount);
5815   if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
    // Distribute-only variant does not take a thread count; emit the call
    // before the insert block's terminator and return early.
5816     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5817     RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5818     Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5819     OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5820     return;
5821   }
  // The "for" variants additionally receive the team's thread count, queried
  // from the device runtime and cast to the trip-count type.
5822   FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5823       M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5824   Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5825   Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5826 
5827   RealArgs.push_back(
5828       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5829   RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5830   if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
    // Combined distribute+for takes an extra placeholder plus the NoLoop flag.
5831     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5832     RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5833   } else {
5834     RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5835   }
5836 
5837   OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5838 }
5839
// Post-outlining callback for applyWorkshareLoopTarget: replaces the (now
// outlined) loop body with a single call to the device RTL worksharing entry
// point, then deletes the dead loop skeleton.
// NOTE(review): original line 5840 (function name) was dropped by the doc
// extraction; the lambda in applyWorkshareLoopTarget calls it as
// workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
// LoopType, NoLoop), so the missing line presumably read
// "static void workshareLoopTargetCallback(" -- TODO confirm.
5841     OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5842     Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5843     WorksharingLoopType LoopType, bool NoLoop) {
5844   IRBuilder<> &Builder = OMPIRBuilder->Builder;
5845   BasicBlock *Preheader = CLI->getPreheader();
5846   Value *TripCount = CLI->getTripCount();
5847 
5848   // After loop body outling, the loop body contains only set up
5849   // of loop body argument structure and the call to the outlined
5850   // loop body function. Firstly, we need to move setup of loop body args
5851   // into loop preheader.
5852   Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5853                     CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5854 
5855   // The next step is to remove the whole loop. We do not it need anymore.
5856   // That's why make an unconditional branch from loop preheader to loop
5857   // exit block
5858   Builder.restoreIP({Preheader, Preheader->end()});
5859   Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5860   Preheader->getTerminator()->eraseFromParent();
5861   Builder.CreateBr(CLI->getExit());
5862 
5863   // Delete dead loop blocks
5864   OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5865   SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5866   SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5867   CleanUpInfo.EntryBB = CLI->getHeader();
5868   CleanUpInfo.ExitBB = CLI->getExit();
5869   CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5870   DeleteDeadBlocks(BlocksToBeRemoved);
5871 
5872   // Find the instruction which corresponds to loop body argument structure
5873   // and remove the call to loop body function instruction.
5874   Value *LoopBodyArg;
5875   User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5876   assert(OutlinedFnUser &&
5877          "Expected unique undroppable user of outlined function");
5878   CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5879   assert(OutlinedFnCallInstruction && "Expected outlined function call");
5880   assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5881          "Expected outlined function call to be located in loop preheader");
5882   // Check in case no argument structure has been passed.
5883   if (OutlinedFnCallInstruction->arg_size() > 1)
5884     LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5885   else
5886     LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5887   OutlinedFnCallInstruction->eraseFromParent();
5888 
  // The device runtime call takes over iteration; it invokes OutlinedFn once
  // per logical iteration with LoopBodyArg.
5889   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5890                                 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5891 
5892   for (auto &ToBeDeletedItem : ToBeDeleted)
5893     ToBeDeletedItem->eraseFromParent();
5894   CLI->invalidate();
5895 }
5896
// Lowers a canonical loop to a device (target) worksharing loop: the loop body
// is outlined into a function and loop control is delegated to the OpenMP
// device runtime via createTargetLoopWorkshareCall (see PostOutlineCB below).
5897 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5898     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5899     WorksharingLoopType LoopType, bool NoLoop) {
5900   uint32_t SrcLocStrSize;
5901   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5902   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5903 
5904   OutlineInfo OI;
5905   OI.OuterAllocaBB = CLI->getPreheader();
5906   Function *OuterFn = CLI->getPreheader()->getParent();
5907 
5908   // Instructions which need to be deleted at the end of code generation
5909   SmallVector<Instruction *, 4> ToBeDeleted;
5910 
  // NOTE(review): OuterAllocaBB is overwritten here; the assignment from the
  // preheader above appears to be dead.
5911   OI.OuterAllocaBB = AllocaIP.getBlock();
5912 
5913   // Mark the body loop as region which needs to be extracted
5914   OI.EntryBB = CLI->getBody();
5915   OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
5916                                                     "omp.prelatch");
5917 
5918   // Prepare loop body for extraction
5919   Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5920 
5921   // Insert new loop counter variable which will be used only in loop
5922   // body.
5923   AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5924   Instruction *NewLoopCntLoad =
5925       Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5926   // New loop counter instructions are redundant in the loop preheader when
5927   // code generation for workshare loop is finshed. That's why mark them as
5928   // ready for deletion.
5929   ToBeDeleted.push_back(NewLoopCntLoad);
5930   ToBeDeleted.push_back(NewLoopCnt);
5931 
5932   // Analyse loop body region. Find all input variables which are used inside
5933   // loop body region.
5934   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  // NOTE(review): original line 5935, declaring `Blocks` (used immediately
  // below and passed to CodeExtractor), was dropped by the doc extraction --
  // presumably a SmallVector<BasicBlock *, 32>; TODO confirm.
5936   OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5937 
5938   CodeExtractorAnalysisCache CEAC(*OuterFn);
5939   CodeExtractor Extractor(Blocks,
5940                           /* DominatorTree */ nullptr,
5941                           /* AggregateArgs */ true,
5942                           /* BlockFrequencyInfo */ nullptr,
5943                           /* BranchProbabilityInfo */ nullptr,
5944                           /* AssumptionCache */ nullptr,
5945                           /* AllowVarArgs */ true,
5946                           /* AllowAlloca */ true,
5947                           /* AllocationBlock */ CLI->getPreheader(),
5948                           /* Suffix */ ".omp_wsloop",
5949                           /* AggrArgsIn0AddrSpace */ true);
5950 
5951   BasicBlock *CommonExit = nullptr;
5952   SetVector<Value *> SinkingCands, HoistingCands;
5953 
5954   // Find allocas outside the loop body region which are used inside loop
5955   // body
5956   Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5957 
5958   // We need to model loop body region as the function f(cnt, loop_arg).
5959   // That's why we replace loop induction variable by the new counter
5960   // which will be one of loop body function argument
  // NOTE(review): original line 5961, beginning the declaration of `Users`
  // (a snapshot of the induction variable's users, completed by the
  // user_end() continuation below), was dropped by the doc extraction --
  // presumably "SmallVector<User *> Users(CLI->getIndVar()->user_begin(),";
  // TODO confirm.
5962                                      CLI->getIndVar()->user_end());
5963   for (auto Use : Users) {
5964     if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5965       if (ParallelRegionBlockSet.count(Inst->getParent())) {
5966         Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5967       }
5968     }
5969   }
5970   // Make sure that loop counter variable is not merged into loop body
5971   // function argument structure and it is passed as separate variable
5972   OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5973 
5974   // PostOutline CB is invoked when loop body function is outlined and
5975   // loop body is replaced by call to outlined function. We need to add
5976   // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5977   // function will handle loop control logic.
5978   //
5979   OI.PostOutlineCB = [=, ToBeDeletedVec =
5980                             std::move(ToBeDeleted)](Function &OutlinedFn) {
5981     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5982                                 LoopType, NoLoop);
5983   };
5984   addOutlineInfo(std::move(OI));
5985   return CLI->getAfterIP();
5986 }
5987
// Top-level worksharing-loop driver: routes to the target-device path or, on
// the host, computes the effective OpenMP schedule and dispatches to the
// matching static/static-chunked/dynamic lowering.
// NOTE(review): original lines 5988-5989 (return type, function name and the
// leading parameters) were dropped by the doc extraction. Given the body uses
// DL, CLI and AllocaIP and forwards them to the apply*WorkshareLoop helpers,
// the missing lines presumably read
// "OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop("
// and "    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,"
// -- TODO confirm.
5990     bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5991     bool HasSimdModifier, bool HasMonotonicModifier,
5992     bool HasNonmonotonicModifier, bool HasOrderedClause,
5993     WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5994     Value *DistScheduleChunkSize) {
  // On the device, loop control is delegated to the OpenMP device RTL.
5995   if (Config.isTargetDevice())
5996     return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5997   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5998       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5999       HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6000 
6001   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6002                    OMPScheduleType::ModifierOrdered;
6003   OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6004   if (HasDistSchedule) {
6005     DistScheduleSchedType = DistScheduleChunkSize
6006                                 ? OMPScheduleType::OrderedDistributeChunked
6007                                 : OMPScheduleType::OrderedDistribute;
6008   }
  // Dispatch on the base schedule (modifiers masked off).
6009   switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6010   case OMPScheduleType::BaseStatic:
6011   case OMPScheduleType::BaseDistribute:
6012     assert((!ChunkSize || !DistScheduleChunkSize) &&
6013            "No chunk size with static-chunked schedule");
    // ordered(static) without dist_schedule is lowered via the dynamic
    // dispatch machinery.
6014     if (IsOrdered && !HasDistSchedule)
6015       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6016                                        NeedsBarrier, ChunkSize);
6017     // FIXME: Monotonicity ignored?
6018     if (DistScheduleChunkSize)
6019       return applyStaticChunkedWorkshareLoop(
6020           DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6021           DistScheduleChunkSize, DistScheduleSchedType);
6022     return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6023                                     HasDistSchedule);
6024 
6025   case OMPScheduleType::BaseStaticChunked:
6026   case OMPScheduleType::BaseDistributeChunked:
6027     if (IsOrdered && !HasDistSchedule)
6028       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6029                                        NeedsBarrier, ChunkSize);
6030     // FIXME: Monotonicity ignored?
6031     return applyStaticChunkedWorkshareLoop(
6032         DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6033         DistScheduleChunkSize, DistScheduleSchedType);
6034 
6035   case OMPScheduleType::BaseRuntime:
6036   case OMPScheduleType::BaseAuto:
6037   case OMPScheduleType::BaseGreedy:
6038   case OMPScheduleType::BaseBalanced:
6039   case OMPScheduleType::BaseSteal:
6040   case OMPScheduleType::BaseRuntimeSimd:
    // These schedules take no user chunk; the chunked/dynamic cases below do.
6041     assert(!ChunkSize &&
6042            "schedule type does not support user-defined chunk sizes");
6043     [[fallthrough]];
6044   case OMPScheduleType::BaseGuidedSimd:
6045   case OMPScheduleType::BaseDynamicChunked:
6046   case OMPScheduleType::BaseGuidedChunked:
6047   case OMPScheduleType::BaseGuidedIterativeChunked:
6048   case OMPScheduleType::BaseGuidedAnalyticalChunked:
6049   case OMPScheduleType::BaseStaticBalancedChunked:
6050     return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6051                                      NeedsBarrier, ChunkSize);
6052 
6053   default:
6054     llvm_unreachable("Unknown/unimplemented schedule kind");
6055   }
6056 }
6057
6058/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6059/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6060/// the runtime. Always interpret integers as unsigned similarly to
6061/// CanonicalLoopInfo.
// NOTE(review): original line 6063 (function name + parameters) was dropped by
// the doc extraction. The call site in applyDynamicWorkshareLoop invokes it as
// getKmpcForDynamicInitForType(IVTy, M, *this), so the missing line presumably
// read "getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {"
// -- TODO confirm.
6062static FunctionCallee
6064   unsigned Bitwidth = Ty->getIntegerBitWidth();
  // __kmpc_dispatch_init_4u for 32-bit counters, _8u for 64-bit.
6065   if (Bitwidth == 32)
6066     return OMPBuilder.getOrCreateRuntimeFunction(
6067         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6068   if (Bitwidth == 64)
6069     return OMPBuilder.getOrCreateRuntimeFunction(
6070         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6071   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6072 }
6073
6074/// Returns an LLVM function to call for updating the next loop using OpenMP
6075/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6076/// the runtime. Always interpret integers as unsigned similarly to
6077/// CanonicalLoopInfo.
// NOTE(review): original line 6079 (function name + parameters) was dropped by
// the doc extraction. The call site in applyDynamicWorkshareLoop invokes it as
// getKmpcForDynamicNextForType(IVTy, M, *this), so the missing line presumably
// read "getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {"
// -- TODO confirm.
6078static FunctionCallee
6080   unsigned Bitwidth = Ty->getIntegerBitWidth();
  // __kmpc_dispatch_next_4u for 32-bit counters, _8u for 64-bit.
6081   if (Bitwidth == 32)
6082     return OMPBuilder.getOrCreateRuntimeFunction(
6083         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6084   if (Bitwidth == 64)
6085     return OMPBuilder.getOrCreateRuntimeFunction(
6086         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6087   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6088 }
6089
6090/// Returns an LLVM function to call for finalizing the dynamic loop using
6091/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6092/// interpret integers as unsigned similarly to CanonicalLoopInfo.
// NOTE(review): original line 6094 (function name + parameters) was dropped by
// the doc extraction. The call site in applyDynamicWorkshareLoop invokes it as
// getKmpcForDynamicFiniForType(IVTy, M, *this), so the missing line presumably
// read "getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {"
// -- TODO confirm.
6093static FunctionCallee
6095   unsigned Bitwidth = Ty->getIntegerBitWidth();
  // __kmpc_dispatch_fini_4u for 32-bit counters, _8u for 64-bit.
6096   if (Bitwidth == 32)
6097     return OMPBuilder.getOrCreateRuntimeFunction(
6098         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6099   if (Bitwidth == 64)
6100     return OMPBuilder.getOrCreateRuntimeFunction(
6101         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6102   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6103 }
6104
// Lowers a canonical loop to an OpenMP dynamically-scheduled worksharing loop
// by wrapping it in an outer dispatch loop driven by __kmpc_dispatch_init/
// next (and _fini when `ordered` is present). Invalidates the CLI.
// NOTE(review): original line 6105 (the return type) was dropped by the doc
// extraction; given the `return BarrierIP.takeError()` below it presumably
// read "OpenMPIRBuilder::InsertPointOrErrorTy" -- TODO confirm.
6106OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6107                                           InsertPointTy AllocaIP,
6108                                           OMPScheduleType SchedType,
6109                                           bool NeedsBarrier, Value *Chunk) {
6110   assert(CLI->isValid() && "Requires a valid canonical loop");
6111   assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6112          "Require dedicated allocate IP");
  // NOTE(review): original line 6113, the opening of the third assertion
  // whose message continues below, was dropped by the doc extraction --
  // presumably "assert(isValidWorkshareLoopScheduleType(SchedType) &&";
  // TODO confirm.
6114          "Require valid schedule type");
6115 
6116   bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6117                  OMPScheduleType::ModifierOrdered;
6118 
6119   // Set up the source location value for OpenMP runtime.
6120   Builder.SetCurrentDebugLocation(DL);
6121 
6122   uint32_t SrcLocStrSize;
6123   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6124   Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6125 
6126   // Declare useful OpenMP runtime functions.
6127   Value *IV = CLI->getIndVar();
6128   Type *IVTy = IV->getType();
6129   FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6130   FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6131 
6132   // Allocate space for computed loop bounds as expected by the "init" function.
6133   Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6134   Type *I32Type = Type::getInt32Ty(M.getContext());
6135   Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6136   Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6137   Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6138   Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6139   CLI->setLastIter(PLastIter);
6140 
6141   // At the end of the preheader, prepare for calling the "init" function by
6142   // storing the current loop bounds into the allocated space. A canonical loop
6143   // always iterates from 0 to trip-count with step 1. Note that "init" expects
6144   // and produces an inclusive upper bound.
6145   BasicBlock *PreHeader = CLI->getPreheader();
6146   Builder.SetInsertPoint(PreHeader->getTerminator());
6147   Constant *One = ConstantInt::get(IVTy, 1);
6148   Builder.CreateStore(One, PLowerBound);
6149   Value *UpperBound = CLI->getTripCount();
6150   Builder.CreateStore(UpperBound, PUpperBound);
6151   Builder.CreateStore(One, PStride);
6152 
6153   BasicBlock *Header = CLI->getHeader();
6154   BasicBlock *Exit = CLI->getExit();
6155   BasicBlock *Cond = CLI->getCond();
6156   BasicBlock *Latch = CLI->getLatch();
6157   InsertPointTy AfterIP = CLI->getAfterIP();
6158 
6159   // The CLI will be "broken" in the code below, as the loop is no longer
6160   // a valid canonical loop.
6161 
6162   if (!Chunk)
6163     Chunk = One;
6164 
6165   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6166 
6167   Constant *SchedulingType =
6168       ConstantInt::get(I32Type, static_cast<int>(SchedType));
6169 
6170   // Call the "init" function.
6171   createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6172                                           /* LowerBound */ One, UpperBound,
6173                                           /* step */ One, Chunk});
6174 
6175   // An outer loop around the existing one.
6176   BasicBlock *OuterCond = BasicBlock::Create(
6177       PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6178       PreHeader->getParent());
6179   // This needs to be 32-bit always, so can't use the IVTy Zero above.
6180   Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  // NOTE(review): original line 6181, opening the "next" call whose
  // arguments continue below and whose result is `Res` (compared against
  // zero two lines down), was dropped by the doc extraction -- presumably
  // "Value *Res = createRuntimeFunctionCall("; TODO confirm.
6182       DynamicNext,
6183       {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6184   Constant *Zero32 = ConstantInt::get(I32Type, 0);
6185   Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  // "next" returned an inclusive 1-based lower bound; convert to 0-based.
6186   Value *LowerBound =
6187       Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6188   Builder.CreateCondBr(MoreWork, Header, Exit);
6189 
6190   // Change PHI-node in loop header to use outer cond rather than preheader,
6191   // and set IV to the LowerBound.
6192   Instruction *Phi = &Header->front();
6193   auto *PI = cast<PHINode>(Phi);
6194   PI->setIncomingBlock(0, OuterCond);
6195   PI->setIncomingValue(0, LowerBound);
6196 
6197   // Then set the pre-header to jump to the OuterCond
6198   Instruction *Term = PreHeader->getTerminator();
6199   auto *Br = cast<BranchInst>(Term);
6200   Br->setSuccessor(0, OuterCond);
6201 
6202   // Modify the inner condition:
6203   // * Use the UpperBound returned from the DynamicNext call.
6204   // * jump to the loop outer loop when done with one of the inner loops.
6205   Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6206   UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6207   Instruction *Comp = &*Builder.GetInsertPoint();
6208   auto *CI = cast<CmpInst>(Comp);
6209   CI->setOperand(1, UpperBound);
6210   // Redirect the inner exit to branch to outer condition.
6211   Instruction *Branch = &Cond->back();
6212   auto *BI = cast<BranchInst>(Branch);
6213   assert(BI->getSuccessor(1) == Exit);
6214   BI->setSuccessor(1, OuterCond);
6215 
6216   // Call the "fini" function if "ordered" is present in wsloop directive.
6217   if (Ordered) {
6218     Builder.SetInsertPoint(&Latch->back());
6219     FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6220     createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6221   }
6222 
6223   // Add the barrier if requested.
6224   if (NeedsBarrier) {
6225     Builder.SetInsertPoint(&Exit->back());
6226     InsertPointOrErrorTy BarrierIP =
  // NOTE(review): original line 6227, the createBarrier call whose
  // arguments continue below, was dropped by the doc extraction --
  // presumably "createBarrier(LocationDescription(Builder.saveIP(), DL),"
  // (mirroring the call in the static-chunked path above); TODO confirm.
6228             omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6229             /* CheckCancelFlag */ false);
6230     if (!BarrierIP)
6231       return BarrierIP.takeError();
6232   }
6233 
6234   CLI->invalidate();
6235   return AfterIP;
6236 }
6237
6238/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6239/// after this \p OldTarget will be orphaned.
// NOTE(review): original line 6240 (function name + first parameter) was
// dropped by the doc extraction. Call sites use
// redirectAllPredecessorsTo(Pred, Dest, DL), so the missing line presumably
// read "static void redirectAllPredecessorsTo(BasicBlock *OldTarget," -- TODO confirm.
6241                                       BasicBlock *NewTarget, DebugLoc DL) {
  // early-inc range: redirectTo rewrites the predecessor list while iterating.
6242   for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6243     redirectTo(Pred, NewTarget, DL);
6244 }
6245
6246/// Determine which blocks in \p BBs are reachable from outside and remove the
6247/// ones that are not reachable from the function.
// NOTE(review): original lines 6248-6249 were dropped by the doc extraction.
// Call sites use removeUnusedBlocksFromParent(OldControlBBs), so line 6248
// presumably read
// "static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {"
// and line 6249 presumably declared the set-like worklist `BBsToErase`
// (it supports count() and remove_if(), and is initialized from BBs) --
// TODO confirm both.
6250   auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    // A block is still "used" if any user of it lives outside the erase set.
6251     for (Use &U : BB->uses()) {
6252       auto *UseInst = dyn_cast<Instruction>(U.getUser());
6253       if (!UseInst)
6254         continue;
6255       if (BBsToErase.count(UseInst->getParent()))
6256         continue;
6257       return true;
6258     }
6259     return false;
6260   };
6261 
  // Fixed point: removing a block can make other blocks externally used no
  // longer, so keep pruning until nothing changes.
6262   while (BBsToErase.remove_if(HasRemainingUses)) {
6263     // Try again if anything was removed.
6264   }
6265 
6266   SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6267   DeleteDeadBlocks(BBVec);
6268 }
6269
// Collapses a perfect (or near-perfect) loop nest of canonical loops into a
// single canonical loop whose trip count is the product of the input trip
// counts; input induction variables are re-derived via div/mod.
6270CanonicalLoopInfo *
// NOTE(review): original line 6271 (class-qualified name + leading
// parameters) was dropped by the doc extraction; given the uses of DL and
// Loops below it presumably read
// "OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,"
// -- TODO confirm.
6272                                       InsertPointTy ComputeIP) {
6273   assert(Loops.size() >= 1 && "At least one loop required");
6274   size_t NumLoops = Loops.size();
6275 
6276   // Nothing to do if there is already just one loop.
6277   if (NumLoops == 1)
6278     return Loops.front();
6279 
6280   CanonicalLoopInfo *Outermost = Loops.front();
6281   CanonicalLoopInfo *Innermost = Loops.back();
6282   BasicBlock *OrigPreheader = Outermost->getPreheader();
6283   BasicBlock *OrigAfter = Outermost->getAfter();
6284   Function *F = OrigPreheader->getParent();
6285 
6286   // Loop control blocks that may become orphaned later.
6287   SmallVector<BasicBlock *, 12> OldControlBBs;
6288   OldControlBBs.reserve(6 * Loops.size());
  // NOTE(review): original line 6289, the loop header for the statement
  // below, was dropped by the doc extraction -- presumably
  // "for (CanonicalLoopInfo *Loop : Loops)"; TODO confirm.
6290     Loop->collectControlBlocks(OldControlBBs);
6291 
6292   // Setup the IRBuilder for inserting the trip count computation.
6293   Builder.SetCurrentDebugLocation(DL);
6294   if (ComputeIP.isSet())
6295     Builder.restoreIP(ComputeIP);
6296   else
6297     Builder.restoreIP(Outermost->getPreheaderIP());
6298 
6299   // Derive the collapsed' loop trip count.
6300   // TODO: Find common/largest indvar type.
6301   Value *CollapsedTripCount = nullptr;
6302   for (CanonicalLoopInfo *L : Loops) {
6303     assert(L->isValid() &&
6304            "All loops to collapse must be valid canonical loops");
6305     Value *OrigTripCount = L->getTripCount();
6306     if (!CollapsedTripCount) {
6307       CollapsedTripCount = OrigTripCount;
6308       continue;
6309     }
6310 
6311     // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6312     CollapsedTripCount =
6313         Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6314   }
6315 
6316   // Create the collapsed loop control flow.
6317   CanonicalLoopInfo *Result =
6318       createLoopSkeleton(DL, CollapsedTripCount, F,
6319                          OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6320 
6321   // Build the collapsed loop body code.
6322   // Start with deriving the input loop induction variables from the collapsed
6323   // one, using a divmod scheme. To preserve the original loops' order, the
6324   // innermost loop use the least significant bits.
6325   Builder.restoreIP(Result->getBodyIP());
6326 
6327   Value *Leftover = Result->getIndVar();
6328   SmallVector<Value *> NewIndVars;
6329   NewIndVars.resize(NumLoops);
6330   for (int i = NumLoops - 1; i >= 1; --i) {
6331     Value *OrigTripCount = Loops[i]->getTripCount();
6332 
6333     Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6334     NewIndVars[i] = NewIndVar;
6335 
6336     Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6337   }
6338   // Outermost loop gets all the remaining bits.
6339   NewIndVars[0] = Leftover;
6340 
6341   // Construct the loop body control flow.
6342   // We progressively construct the branch structure following in direction of
6343   // the control flow, from the leading in-between code, the loop nest body, the
6344   // trailing in-between code, and rejoining the collapsed loop's latch.
6345   // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6346   // the ContinueBlock is set, continue with that block. If ContinuePred, use
6347   // its predecessors as sources.
6348   BasicBlock *ContinueBlock = Result->getBody();
6349   BasicBlock *ContinuePred = nullptr;
6350   auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6351                                                           BasicBlock *NextSrc) {
6352     if (ContinueBlock)
6353       redirectTo(ContinueBlock, Dest, DL);
6354     else
6355       redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6356 
6357     ContinueBlock = nullptr;
6358     ContinuePred = NextSrc;
6359   };
6360 
6361   // The code before the nested loop of each level.
6362   // Because we are sinking it into the nest, it will be executed more often
6363   // that the original loop. More sophisticated schemes could keep track of what
6364   // the in-between code is and instantiate it only once per thread.
6365   for (size_t i = 0; i < NumLoops - 1; ++i)
6366     ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6367 
6368   // Connect the loop nest body.
6369   ContinueWith(Innermost->getBody(), Innermost->getLatch());
6370 
6371   // The code after the nested loop at each level.
6372   for (size_t i = NumLoops - 1; i > 0; --i)
6373     ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6374 
6375   // Connect the finished loop to the collapsed loop latch.
6376   ContinueWith(Result->getLatch(), nullptr);
6377 
6378   // Replace the input loops with the new collapsed loop.
6379   redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6380   redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6381 
6382   // Replace the input loop indvars with the derived ones.
6383   for (size_t i = 0; i < NumLoops; ++i)
6384     Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6385 
6386   // Remove unused parts of the input loops.
6387   removeUnusedBlocksFromParent(OldControlBBs);
6388 
6389   for (CanonicalLoopInfo *L : Loops)
6390     L->invalidate();
6391 
6392 #ifndef NDEBUG
6393   Result->assertOK();
6394 #endif
6395   return Result;
6396 }
6397
// Tiles a loop nest: for each input loop, generates an outer "floor" loop over
// tiles and an inner "tile" loop over iterations within a tile (with a partial
// last tile when the trip count does not divide the tile size). Returns the
// 2*N generated loops: all floor loops first, then all tile loops.
6398std::vector<CanonicalLoopInfo *>
// NOTE(review): original line 6399 (class-qualified name + leading
// parameters) was dropped by the doc extraction; given the uses of DL and
// Loops below it presumably read
// "OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,"
// -- TODO confirm.
6400                           ArrayRef<Value *> TileSizes) {
6401   assert(TileSizes.size() == Loops.size() &&
6402          "Must pass as many tile sizes as there are loops");
6403   int NumLoops = Loops.size();
6404   assert(NumLoops >= 1 && "At least one loop to tile required");
6405 
6406   CanonicalLoopInfo *OutermostLoop = Loops.front();
6407   CanonicalLoopInfo *InnermostLoop = Loops.back();
6408   Function *F = OutermostLoop->getBody()->getParent();
6409   BasicBlock *InnerEnter = InnermostLoop->getBody();
6410   BasicBlock *InnerLatch = InnermostLoop->getLatch();
6411 
6412   // Loop control blocks that may become orphaned later.
6413   SmallVector<BasicBlock *, 12> OldControlBBs;
6414   OldControlBBs.reserve(6 * Loops.size());
  // NOTE(review): original line 6415, the loop header for the statement
  // below, was dropped by the doc extraction -- presumably
  // "for (CanonicalLoopInfo *Loop : Loops)"; TODO confirm.
6416     Loop->collectControlBlocks(OldControlBBs);
6417 
6418   // Collect original trip counts and induction variable to be accessible by
6419   // index. Also, the structure of the original loops is not preserved during
6420   // the construction of the tiled loops, so do it before we scavenge the BBs of
6421   // any original CanonicalLoopInfo.
6422   SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6423   for (CanonicalLoopInfo *L : Loops) {
6424     assert(L->isValid() && "All input loops must be valid canonical loops");
6425     OrigTripCounts.push_back(L->getTripCount());
6426     OrigIndVars.push_back(L->getIndVar());
6427   }
6428 
6429   // Collect the code between loop headers. These may contain SSA definitions
6430   // that are used in the loop nest body. To be usable with in the innermost
6431   // body, these BasicBlocks will be sunk into the loop nest body. That is,
6432   // these instructions may be executed more often than before the tiling.
6433   // TODO: It would be sufficient to only sink them into body of the
6434   // corresponding tile loop.
  // NOTE(review): original line 6435, declaring `InbetweenCode` (filled via
  // emplace_back below and iterated later as
  // std::pair<BasicBlock *, BasicBlock *>), was dropped by the doc
  // extraction -- presumably a SmallVector of such pairs; TODO confirm.
6436   for (int i = 0; i < NumLoops - 1; ++i) {
6437     CanonicalLoopInfo *Surrounding = Loops[i];
6438     CanonicalLoopInfo *Nested = Loops[i + 1];
6439 
6440     BasicBlock *EnterBB = Surrounding->getBody();
6441     BasicBlock *ExitBB = Nested->getHeader();
6442     InbetweenCode.emplace_back(EnterBB, ExitBB);
6443   }
6444 
6445   // Compute the trip counts of the floor loops.
6446   Builder.SetCurrentDebugLocation(DL);
6447   Builder.restoreIP(OutermostLoop->getPreheaderIP());
6448   SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6449   for (int i = 0; i < NumLoops; ++i) {
6450     Value *TileSize = TileSizes[i];
6451     Value *OrigTripCount = OrigTripCounts[i];
6452     Type *IVType = OrigTripCount->getType();
6453 
6454     Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6455     Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6456 
6457     // 0 if tripcount divides the tilesize, 1 otherwise.
6458     // 1 means we need an additional iteration for a partial tile.
6459     //
6460     // Unfortunately we cannot just use the roundup-formula
6461     //   (tripcount + tilesize - 1)/tilesize
6462     // because the summation might overflow. We do not want introduce undefined
6463     // behavior when the untiled loop nest did not.
6464     Value *FloorTripOverflow =
6465         Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6466 
6467     FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6468     Value *FloorTripCount =
6469         Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6470                           "omp_floor" + Twine(i) + ".tripcount", true);
6471 
6472     // Remember some values for later use.
6473     FloorCompleteCount.push_back(FloorCompleteTripCount);
6474     FloorCount.push_back(FloorTripCount);
6475     FloorRems.push_back(FloorTripRem);
6476   }
6477 
6478   // Generate the new loop nest, from the outermost to the innermost.
6479   std::vector<CanonicalLoopInfo *> Result;
6480   Result.reserve(NumLoops * 2);
6481 
6482   // The basic block of the surrounding loop that enters the nest generated
6483   // loop.
6484   BasicBlock *Enter = OutermostLoop->getPreheader();
6485 
6486   // The basic block of the surrounding loop where the inner code should
6487   // continue.
6488   BasicBlock *Continue = OutermostLoop->getAfter();
6489 
6490   // Where the next loop basic block should be inserted.
6491   BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6492 
6493   auto EmbeddNewLoop =
6494       [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6495           Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6496     CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6497         DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6498     redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6499     redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6500 
6501     // Setup the position where the next embedded loop connects to this loop.
6502     Enter = EmbeddedLoop->getBody();
6503     Continue = EmbeddedLoop->getLatch();
6504     OutroInsertBefore = EmbeddedLoop->getLatch();
6505     return EmbeddedLoop;
6506   };
6507 
6508   auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6509                                                   const Twine &NameBase) {
6510     for (auto P : enumerate(TripCounts)) {
6511       CanonicalLoopInfo *EmbeddedLoop =
6512           EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6513       Result.push_back(EmbeddedLoop);
6514     }
6515   };
6516 
6517   EmbeddNewLoops(FloorCount, "floor");
6518 
6519   // Within the innermost floor loop, emit the code that computes the tile
6520   // sizes.
6521   Builder.SetInsertPoint(Enter->getTerminator());
6522   SmallVector<Value *, 4> TileCounts;
6523   for (int i = 0; i < NumLoops; ++i) {
6524     CanonicalLoopInfo *FloorLoop = Result[i];
6525     Value *TileSize = TileSizes[i];
6526 
    // The last floor iteration (if any remainder exists) runs the partial
    // tile of FloorRems[i] iterations instead of a full TileSize.
6527     Value *FloorIsEpilogue =
6528         Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6529     Value *TileTripCount =
6530         Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6531 
6532     TileCounts.push_back(TileTripCount);
6533   }
6534 
6535   // Create the tile loops.
6536   EmbeddNewLoops(TileCounts, "tile");
6537 
6538   // Insert the inbetween code into the body.
6539   BasicBlock *BodyEnter = Enter;
6540   BasicBlock *BodyEntered = nullptr;
6541   for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6542     BasicBlock *EnterBB = P.first;
6543     BasicBlock *ExitBB = P.second;
6544 
6545     if (BodyEnter)
6546       redirectTo(BodyEnter, EnterBB, DL);
6547     else
6548       redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6549 
6550     BodyEnter = nullptr;
6551     BodyEntered = ExitBB;
6552   }
6553 
6554   // Append the original loop nest body into the generated loop nest body.
6555   if (BodyEnter)
6556     redirectTo(BodyEnter, InnerEnter, DL);
6557   else
6558     redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
  // NOTE(review): original line 6559 was dropped by the doc extraction; its
  // content is not recoverable from this view -- TODO check upstream.
6560 
6561   // Replace the original induction variable with an induction variable computed
6562   // from the tile and floor induction variables.
6563   Builder.restoreIP(Result.back()->getBodyIP());
6564   for (int i = 0; i < NumLoops; ++i) {
6565     CanonicalLoopInfo *FloorLoop = Result[i];
6566     CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6567     Value *OrigIndVar = OrigIndVars[i];
6568     Value *Size = TileSizes[i];
6569 
    // orig_iv = floor_iv * tile_size + tile_iv (NUW: cannot exceed the
    // original trip count).
6570     Value *Scale =
6571         Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6572     Value *Shift =
6573         Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6574     OrigIndVar->replaceAllUsesWith(Shift);
6575   }
6576 
6577   // Remove unused parts of the original loops.
6578   removeUnusedBlocksFromParent(OldControlBBs);
6579 
6580   for (CanonicalLoopInfo *L : Loops)
6581     L->invalidate();
6582 
6583 #ifndef NDEBUG
6584   for (CanonicalLoopInfo *GenL : Result)
6585     GenL->assertOK();
6586 #endif
6587   return Result;
6588 }
6589
6590/// Attach metadata \p Properties to the basic block described by \p BB. If the
6591/// basic block already has metadata, the basic block properties are appended.
6593 ArrayRef<Metadata *> Properties) {
6594 // Nothing to do if no property to attach.
6595 if (Properties.empty())
6596 return;
6597
6598 LLVMContext &Ctx = BB->getContext();
6599 SmallVector<Metadata *> NewProperties;
6600 NewProperties.push_back(nullptr);
6601
6602 // If the basic block already has metadata, prepend it to the new metadata.
6603 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6604 if (Existing)
6605 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6606
6607 append_range(NewProperties, Properties);
6608 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6609 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6610
6611 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6612}
6613
6614/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6615/// loop already has metadata, the loop properties are appended.
6617 ArrayRef<Metadata *> Properties) {
6618 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6619
6620 // Attach metadata to the loop's latch
6621 BasicBlock *Latch = Loop->getLatch();
6622 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6623 addBasicBlockMetadata(Latch, Properties);
6624}
6625
6626/// Attach llvm.access.group metadata to the memref instructions of \p Block
6628 LoopInfo &LI) {
6629 for (Instruction &I : *Block) {
6630 if (I.mayReadOrWriteMemory()) {
6631 // TODO: This instruction may already have access group from
6632 // other pragmas e.g. #pragma clang loop vectorize. Append
6633 // so that the existing metadata is not overwritten.
6634 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6635 }
6636 }
6637}
6638
CanonicalLoopInfo *
  // Fuse the given sibling canonical loops into a single canonical loop whose
  // trip count is the maximum of the original trip counts. Each original body
  // is guarded by a comparison of the fused induction variable against its own
  // original trip count.
  CanonicalLoopInfo *firstLoop = Loops.front();
  CanonicalLoopInfo *lastLoop = Loops.back();
  Function *F = firstLoop->getPreheader()->getParent();

  // Loop control blocks that will become orphaned later
  SmallVector<BasicBlock *> oldControlBBs;
    Loop->collectControlBlocks(oldControlBBs);

  // Collect original trip counts
  SmallVector<Value *> origTripCounts;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    origTripCounts.push_back(L->getTripCount());
  }

  Builder.SetCurrentDebugLocation(DL);

  // Compute max trip count.
  // The fused loop will be from 0 to max(origTripCounts)
  BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
                                           F, firstLoop->getHeader());
  Builder.SetInsertPoint(TCBlock);
  Value *fusedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All loops to fuse must be valid canonical loops");
    Value *origTripCount = L->getTripCount();
    if (!fusedTripCount) {
      // First loop seeds the running maximum.
      fusedTripCount = origTripCount;
      continue;
    }
    // Running max via signed compare + select.
    Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
    fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
                                          ".omp.fuse.tc");
  }

  // Generate new loop
  CanonicalLoopInfo *fused =
      createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
                         lastLoop->getLatch(), "fused");

  // Replace original loops with the fused loop
  // Preheader and After are not considered inside the CLI.
  // These are used to compute the individual TCs of the loops
  // so they have to be put before the resulting fused loop.
  // Moving them up for readability.
  for (size_t i = 0; i < Loops.size() - 1; ++i) {
    Loops[i]->getPreheader()->moveBefore(TCBlock);
    Loops[i]->getAfter()->moveBefore(TCBlock);
  }
  lastLoop->getPreheader()->moveBefore(TCBlock);

  // Chain preheader -> after of each original loop so the trip-count
  // computations still execute in order, then fall into the fused loop.
  for (size_t i = 0; i < Loops.size() - 1; ++i) {
    redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
    redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
  }
  redirectTo(lastLoop->getPreheader(), TCBlock, DL);
  redirectTo(TCBlock, fused->getPreheader(), DL);
  redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);

  // Build the fused body
  // Create new Blocks with conditions that jump to the original loop bodies
  SmallVector<Value *> condValues;
  for (size_t i = 0; i < Loops.size(); ++i) {
    BasicBlock *condBlock = BasicBlock::Create(
        F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
    Builder.SetInsertPoint(condBlock);
    // Guard: only execute body i while the fused IV is below its trip count.
    Value *condValue =
        Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
    condBBs.push_back(condBlock);
    condValues.push_back(condValue);
  }
  // Join the condition blocks with the bodies of the original loops
  redirectTo(fused->getBody(), condBBs[0], DL);
  for (size_t i = 0; i < Loops.size() - 1; ++i) {
    Builder.SetInsertPoint(condBBs[i]);
    Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
    redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
    // Replace the IV with the fused IV
    Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
  }
  // Last body jumps to the created end body block
  Builder.SetInsertPoint(condBBs.back());
  Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
                       fused->getLatch());
  redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
  // Replace the IV with the fused IV
  lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());

  // The loop latch must have only one predecessor. Currently it is branched to
  // from both the last condition block and the last loop body
  fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
                                           "omp.fused.pre_latch");

  // Remove unused parts
  removeUnusedBlocksFromParent(oldControlBBs);

  // Invalidate old CLIs
  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  fused->assertOK();
#endif
  return fused;
}
6748
6750 LLVMContext &Ctx = Builder.getContext();
6752 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6753 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6754}
6755
6757 LLVMContext &Ctx = Builder.getContext();
6759 Loop, {
6760 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6761 });
6762}
6763
/// Emit an if-versioned loop body: the original (to-be-vectorized) body runs
/// when \p IfCond holds, otherwise a cloned copy of the body runs. Both
/// versions share the canonical loop's control flow so \p CanonicalLoop stays
/// valid for subsequent loop-associated constructs.
void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // We can't do
  // if (cond) {
  //   simd_loop;
  // } else {
  //   non_simd_loop;
  // }
  // because then the CanonicalLoopInfo would only point to one of the loops:
  // leading to other constructs operating on the same loop to malfunction.
  // Instead generate
  // while (...) {
  //   if (cond) {
  //     simd_body;
  //   } else {
  //     not_simd_body;
  //   }
  // }
  // At least for simple loops, LLVM seems able to hoist the if out of the loop
  // body at -O3

  // Define where if branch should be inserted
  auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();

  // Create additional blocks for the if statement
  BasicBlock *Cond = SplitBeforeIt->getParent();
  llvm::LLVMContext &C = Cond->getContext();
      C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
      C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());

  // Create if condition branch.
  Builder.SetInsertPoint(SplitBeforeIt);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // Then block contains branch to omp loop body which needs to be vectorized
  spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
  ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone loop for the else branch

  SmallVector<BasicBlock *, 8> ExistingBlocks;
  ExistingBlocks.reserve(L->getNumBlocks() + 1);
  ExistingBlocks.push_back(ThenBlock);
  ExistingBlocks.append(L->block_begin(), L->block_end());
  // Cond is the block that has the if clause condition
  // LoopCond is omp_loop.cond
  // LoopHeader is omp_loop.header
  BasicBlock *LoopCond = Cond->getUniquePredecessor();
  BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
  assert(LoopCond && LoopHeader && "Invalid loop structure");
  for (BasicBlock *Block : ExistingBlocks) {
    // Only the body blocks are duplicated; the shared loop-control blocks
    // (preheader, latch, header, cond) are reused by both versions.
    if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
        Block == LoopHeader || Block == LoopCond || Block == Cond) {
      continue;
    }
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);

    // fix name not to be omp.if.then
    if (Block == ThenBlock)
      NewBB->setName(NamePrefix + ".if.else");

    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  // Rewrite the cloned blocks' operands to refer to their cloned counterparts.
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());

  // The loop latch must have only one predecessor. Currently it is branched to
  // from both the 'then' and 'else' branches.
  L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
                                           NamePrefix + ".pre_latch");

  // Ensure that the then block is added to the loop so we add the attributes in
  // the next step
  L->addBasicBlockToLoop(ThenBlock, LI);
}
6851
6852unsigned
6854 const StringMap<bool> &Features) {
6855 if (TargetTriple.isX86()) {
6856 if (Features.lookup("avx512f"))
6857 return 512;
6858 else if (Features.lookup("avx"))
6859 return 256;
6860 return 128;
6861 }
6862 if (TargetTriple.isPPC())
6863 return 128;
6864 if (TargetTriple.isWasm())
6865 return 128;
6866 return 0;
6867}
6868
                                MapVector<Value *, Value *> AlignedVars,
                                Value *IfCond, OrderKind Order,
                                ConstantInt *Simdlen, ConstantInt *Safelen) {
  LLVMContext &Ctx = Builder.getContext();

  Function *F = CanonicalLoop->getFunction();

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);

  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
  if (AlignedVars.size()) {
    // Emit alignment assumptions for each aligned() list item right after the
    // instruction that produces the pointer.
    InsertPointTy IP = Builder.saveIP();
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
      // NOTE(review): dyn_cast yields nullptr when AlignedPtr is not an
      // Instruction; the unconditional getNextNode() below assumes callers
      // only pass instruction-defined pointers — confirm.
      Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
      Builder.SetInsertPoint(loadInst->getNextNode());
      Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
                                        Alignment);
    }
    Builder.restoreIP(IP);
  }

  if (IfCond) {
    // if(cond) requires two loop versions sharing one backedge; see
    // createIfVersion for why the if lives inside the loop body.
    ValueToValueMapTy VMap;
    createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
  }


  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : L->getBlocks()) {
    if (Block == CanonicalLoop->getCond() ||
        Block == CanonicalLoop->getHeader())
      continue;
    Reachable.insert(Block);
  }

  SmallVector<Metadata *> LoopMDList;

  // In presence of finite 'safelen', it may be unsafe to mark all
  // the memory instructions parallel, because loop-carried
  // dependences of 'safelen' iterations are possible.
  // If clause order(concurrent) is specified then the memory instructions
  // are marked parallel even if 'safelen' is finite.
  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
    applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);

  // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
  // versions so we can't add the loop attributes in that case.
  if (IfCond) {
    // we can still add llvm.loop.parallel_access
    addLoopMetadata(CanonicalLoop, LoopMDList);
    return;
  }

  // Use the above access group metadata to create loop level
  // metadata, which should be distinct for each loop.
  ConstantAsMetadata *BoolConst =
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    // If both simdlen and safelen clauses are specified, the value of the
    // simdlen parameter must be less than or equal to the value of the safelen
    // parameter. Therefore, use safelen only in the absence of simdlen.
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
    LoopMDList.push_back(
        MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
                          ConstantAsMetadata::get(VectorizeWidth)}));
  }

  addLoopMetadata(CanonicalLoop, LoopMDList);
}
6958
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipeline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might be worth requiring front-ends to pass on their TargetMachine, or at
/// least cache it between methods. Note that while frontends such as Clang
/// have just a single main TargetMachine per translation unit, "target-cpu"
/// and "target-features" that determine the TargetMachine are per-function
/// and can be overridden using __attribute__((target("OPTIONS"))).
static std::unique_ptr<TargetMachine>
  Module *M = F->getParent();

  // Honor per-function target overrides when constructing the machine.
  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const llvm::Triple &Triple = M->getTargetTriple();

  std::string Error;
  // An unknown or unregistered target yields a null TargetMachine.
  if (!TheTarget)
    return {};

  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}
6994
/// Heuristically determine the best-performant unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as the
/// LoopUnrollPass.
  Function *F = CLI->getFunction();

  // Assume the user requests the most aggressive unrolling, even if the rest of
  // the code is optimized using a lower setting.
  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

  // Register only the analyses required by the unroll heuristics below.
  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
  FAM.registerPass([]() { return AssumptionAnalysis(); });
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  TargetIRAnalysis TIRA;
  if (TM)
    // Use the target-specific TTI when a TargetMachine is available;
    // otherwise TIRA falls back to the base (target-independent) TTI.
    TIRA = TargetIRAnalysis(
        [&](const Function &F) { return TM->getTargetTransformInfo(F); });
  FAM.registerPass([&]() { return TIRA; });

  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
  ScalarEvolution &&SE = SEA.run(*F, FAM);
  DominatorTree &&DT = DTA.run(*F, FAM);
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  AssumptionCache &&AC = ACT.run(*F, FAM);

  Loop *L = LI.getLoopFor(CLI->getHeader());
  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

      L, SE, TTI,
      /*BlockFrequencyInfo=*/nullptr,
      /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
      /*UserThreshold=*/std::nullopt,
      /*UserCount=*/std::nullopt,
      /*UserAllowPartial=*/true,
      /*UserAllowRuntime=*/true,
      /*UserUpperBound=*/std::nullopt,
      /*UserFullUnrollMaxCount=*/std::nullopt);

  UP.Force = true;

  // Account for additional optimizations taking place before the LoopUnrollPass
  // would unroll the loop.

  // Use normal unroll factors even if the rest of the code is optimized for
  // size.

  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
                    << "  Threshold=" << UP.Threshold << "\n"
                    << "  PartialThreshold=" << UP.PartialThreshold << "\n"
                    << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
                    << "  PartialOptSizeThreshold="
                    << UP.PartialOptSizeThreshold << "\n");

  // Disable peeling.
      /*UserAllowPeeling=*/false,
      /*UserAllowProfileBasedPeeling=*/false,
      /*UnrollingSpecficValues=*/false);

  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  // Assume that reads and writes to stack variables can be eliminated by
  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
  // size.
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      Value *Ptr;
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        Ptr = Load->getPointerOperand();
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        Ptr = Store->getPointerOperand();
      } else
        continue;

      Ptr = Ptr->stripPointerCasts();

      // Only loads/stores of entry-block allocas are treated as ephemeral.
      if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
        if (Alloca->getParent() == &F->getEntryBlock())
          EphValues.insert(&I);
      }
    }
  }

  UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);

  // Loop is not unrollable if the loop contains certain instructions.
  if (!UCE.canUnroll()) {
    LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
    return 1;
  }

  LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
                    << "\n");

  // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
  // be able to use it.
  int TripCount = 0;
  int MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
  computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
                     MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
                     UseUpperBound);
  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  // This function returns 1 to signal to not unroll a loop.
  if (Factor == 0)
    return 1;
  return Factor;
}
7125
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *F = Loop->getFunction();
  LLVMContext &Ctx = F->getContext();

  // If the unrolled loop is not used for another loop-associated directive, it
  // is sufficient to add metadata for the LoopUnrollPass.
  if (!UnrolledCLI) {
    SmallVector<Metadata *, 2> LoopMetadata;
    LoopMetadata.push_back(
        MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));

    // Factor == 0 means "let the pass choose"; only pin the count otherwise.
    if (Factor >= 1) {
          ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      LoopMetadata.push_back(MDNode::get(
          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    }

    addLoopMetadata(Loop, LoopMetadata);
    return;
  }

  // Heuristically determine the unroll factor.
  if (Factor == 0)

  // No change required with unroll factor 1.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  Type *IndVarTy = Loop->getIndVarType();

  // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
  // unroll the inner loop.
  Value *FactorVal =
      ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
                                       /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> LoopNest =
      tileLoops(DL, {Loop}, {FactorVal});
  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
  // The outer (floor) loop is what later directives are associated with.
  *UnrolledCLI = LoopNest[0];
  CanonicalLoopInfo *InnerLoop = LoopNest[1];

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Unroll by the unroll factor with a fallback epilog for the remainder
  // iterations if necessary.
      ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      InnerLoop,
      {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}
7193
7196 llvm::Value *BufSize, llvm::Value *CpyBuf,
7197 llvm::Value *CpyFn, llvm::Value *DidIt) {
7198 if (!updateToLocation(Loc))
7199 return Loc.IP;
7200
7201 uint32_t SrcLocStrSize;
7202 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7203 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7204 Value *ThreadId = getOrCreateThreadID(Ident);
7205
7206 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7207
7208 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7209
7210 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7211 createRuntimeFunctionCall(Fn, Args);
7212
7213 return Builder.saveIP();
7214}
7215
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,

  if (!updateToLocation(Loc))
    return Loc.IP;

  // If needed allocate and initialize `DidIt` with 0.
  // DidIt: flag variable: 1=single thread; 0=not single thread.
  llvm::Value *DidIt = nullptr;
  if (!CPVars.empty()) {
    DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
    Builder.CreateStore(Builder.getInt32(0), DidIt);
  }

  Directive OMPD = Directive::OMPD_single;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);

  // Wrap the user finalization so the executing thread also records that it
  // was the one that ran the single region.
  auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
    if (Error Err = FiniCB(IP))
      return Err;

    // The thread that executes the single region must set `DidIt` to 1.
    // This is used by __kmpc_copyprivate, to know if the caller is the
    // single thread or not.
    if (DidIt)
      Builder.CreateStore(Builder.getInt32(1), DidIt);

    return Error::success();
  };

  // generates the following:
  // if (__kmpc_single()) {
  // .... single region ...
  // __kmpc_end_single
  // }
  // __kmpc_copyprivate
  // __kmpc_barrier

  InsertPointOrErrorTy AfterIP =
      EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
                           /*Conditional*/ true,
                           /*hasFinalize*/ true);
  if (!AfterIP)
    return AfterIP.takeError();

  if (DidIt) {
    // Broadcast each copyprivate variable from the single thread.
    for (size_t I = 0, E = CPVars.size(); I < E; ++I)
      // NOTE BufSize is currently unused, so just pass 0.
                        /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
                        CPFuncs[I], DidIt);
    // NOTE __kmpc_copyprivate already inserts a barrier
  } else if (!IsNowait) {
    InsertPointOrErrorTy AfterIP =
                      omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                      /* CheckCancelFlag */ false);
    if (!AfterIP)
      return AfterIP.takeError();
  }
  return Builder.saveIP();
}
7290
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_critical;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  // Named critical regions share one lock per name.
  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  // The entry call may need an extra hint argument; the exit call never does.
  SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
  Function *RTFn = nullptr;
  if (HintInst) {
    // Add Hint to entry Args and create call
    EnterArgs.push_back(HintInst);
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
  } else {
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
  }
  Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}
7324
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  assert(
      llvm::all_of(StoreValues,
                   [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
      "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for vector and generate alloc instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));

  // Store the index value with offset in depend vector.
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  // The runtime takes a pointer to the first element of the depend vector.
  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  // depend(source) posts the iteration; depend(sink) waits for it.
  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  createRuntimeFunctionCall(RTLFn, Args);

  return Builder.saveIP();
}
7371
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsThreads) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_ordered;
  // For ordered simd (IsThreads == false) no runtime calls are emitted; the
  // region is generated without entry/exit calls.
  Instruction *EntryCall = nullptr;
  Instruction *ExitCall = nullptr;

  if (IsThreads) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadId = getOrCreateThreadID(Ident);
    Value *Args[] = {Ident, ThreadId};

    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
    EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

    Function *ExitRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
    ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
  }

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}
7400
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  // Register the finalization callback so cancellation points inside the
  // region can run it; popped again by emitCommonDirectiveExit.
  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create inlined region's entry and body blocks, in preparation
  // for conditional creation
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  // splitBasicBlock needs a terminator; synthesize a placeholder if the
  // current block does not end in a branch. It is removed again below.
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // generate body
  if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
                            /* CodeGenIP */ Builder.saveIP()))
    return Err;

  // emit exit call and do any needed finalization.
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  InsertPointOrErrorTy AfterIP =
      emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  if (!AfterIP)
    return AfterIP.takeError();

  // If we are skipping the region of a non conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  // Drop the placeholder terminator synthesized above, if any.
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}
7450
// Emit the conditional entry for a directive region: when Conditional and
// EntryCall are set, branch on (EntryCall != 0) into a new "omp_region.body"
// block, otherwise fall through to ExitBB. Returns an insertion point in
// ExitBB (or the unchanged current IP when nothing was emitted).
 7451OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
 7452 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
 7453 // if nothing to do, Return current insertion point.
 7454 if (!Conditional || !EntryCall)
 7455 return Builder.saveIP();
 7456
 7457 BasicBlock *EntryBB = Builder.GetInsertBlock();
 7458 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
 7459 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
 // Temporary terminator so ThenBB is well-formed while we move instructions
 // around; erased once the real terminator has been relocated into ThenBB.
 7460 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
 7461
 7462 // Emit thenBB and set the Builder's insertion point there for
 7463 // body generation next. Place the block after the current block.
 7464 Function *CurFn = EntryBB->getParent();
 7465 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
 7466
 7467 // Move Entry branch to end of ThenBB, and replace with conditional
 7468 // branch (If-stmt)
 7469 Instruction *EntryBBTI = EntryBB->getTerminator();
 7470 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
 7471 EntryBBTI->removeFromParent();
 7472 Builder.SetInsertPoint(UI);
 7473 Builder.Insert(EntryBBTI);
 7474 UI->eraseFromParent();
 7475 Builder.SetInsertPoint(ThenBB->getTerminator());
 7476
 7477 // return an insertion point to ExitBB.
 7478 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
 7479}
7480
// Emit region finalization and the exit runtime call at FinIP. When
// HasFinalize, pops the matching entry from FinalizationStack (it must be on
// top and match OMPD) and merges its finalization code into the Fini block.
// ExitCall, if provided, is re-inserted as the last instruction before the
// Fini block's terminator. Returns the insertion point at the exit call (or
// the current IP when there is no exit call).
 7481OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
 7482 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
 7483 bool HasFinalize) {
 7484
 7485 Builder.restoreIP(FinIP);
 7486
 7487 // If there is finalization to do, emit it before the exit call
 7488 if (HasFinalize) {
 7489 assert(!FinalizationStack.empty() &&
 7490 "Unexpected finalization stack state!");
 7491
 7492 FinalizationInfo Fi = FinalizationStack.pop_back_val();
 7493 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
 7494
 7495 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
 7496 return std::move(Err);
 7497
 7498 // Exit condition: insertion point is before the terminator of the new Fini
 7499 // block
 7500 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
 7501 }
 7502
 7503 if (!ExitCall)
 7504 return Builder.saveIP();
 7505
 7506 // place the Exitcall as last instruction before Finalization block terminator
 // ExitCall was created detached/elsewhere by the caller; relocate it here.
 7507 ExitCall->removeFromParent();
 7508 Builder.Insert(ExitCall);
 7509
 7510 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
 7511 ExitCall->getIterator());
 7512}
7513
// Build the CFG skeleton for a `copyin` clause: compare the master and
// private addresses and, when they differ, branch into a "copyin.not.master"
// block whose contents the caller fills in; both paths rejoin at
// "copyin.not.master.end". Returns the IP where the caller should emit the
// copy code (inside CopyBegin, before its branch to CopyEnd when BranchtoEnd).
// NOTE(review): the signature line (and one line after the early return) was
// lost in extraction — presumably OpenMPIRBuilder::createCopyinClauseBlocks
// returning InsertPointTy; verify against upstream.
 7515 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
 7516 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
 // Nothing to do when the caller did not provide a valid insertion point.
 7517 if (!IP.isSet())
 7518 return IP;
 7519
 7521
 7522 // creates the following CFG structure
 7523 // OMP_Entry : (MasterAddr != PrivateAddr)?
 7524 // F T
 7525 // | \
 7526 // | copin.not.master
 7527 // | /
 7528 // v /
 7529 // copyin.not.master.end
 7530 // |
 7531 // v
 7532 // OMP.Entry.Next
 7533
 7534 BasicBlock *OMP_Entry = IP.getBlock();
 7535 Function *CurFn = OMP_Entry->getParent();
 7536 BasicBlock *CopyBegin =
 7537 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
 7538 BasicBlock *CopyEnd = nullptr;
 7539
 7540 // If entry block is terminated, split to preserve the branch to following
 7541 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
 7542 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
 7543 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
 7544 "copyin.not.master.end");
 // Drop the unconditional branch left by splitBasicBlock; it is replaced by
 // the conditional branch created below.
 7545 OMP_Entry->getTerminator()->eraseFromParent();
 7546 } else {
 7547 CopyEnd =
 7548 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
 7549 }
 7550
 7551 Builder.SetInsertPoint(OMP_Entry);
 // Compare the two addresses as integers; copy only if they differ (i.e.
 // this thread's private copy is not the master copy itself).
 7552 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
 7553 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
 7554 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
 7555 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
 7556
 7557 Builder.SetInsertPoint(CopyBegin);
 7558 if (BranchtoEnd)
 7559 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
 7560
 7561 return Builder.saveIP();
 7562}
7563
// Emit a call to __kmpc_alloc(thread_id, Size, Allocator) and return the
// resulting value (named Name). NOTE(review): the signature line and the
// updateToLocation guard were lost in extraction; verify against upstream.
 7565 Value *Size, Value *Allocator,
 7566 std::string Name) {
 7569
 7570 uint32_t SrcLocStrSize;
 7571 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7572 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7573 Value *ThreadId = getOrCreateThreadID(Ident);
 7574 Value *Args[] = {ThreadId, Size, Allocator};
 7575
 7576 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
 7577
 7578 return createRuntimeFunctionCall(Fn, Args, Name);
 7579}
7580
// Emit a call to __kmpc_free(thread_id, Addr, Allocator), releasing memory
// previously obtained via __kmpc_alloc. NOTE(review): signature line and
// updateToLocation guard lost in extraction; verify against upstream.
 7582 Value *Addr, Value *Allocator,
 7583 std::string Name) {
 7586
 7587 uint32_t SrcLocStrSize;
 7588 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7589 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7590 Value *ThreadId = getOrCreateThreadID(Ident);
 7591 Value *Args[] = {ThreadId, Addr, Allocator};
 7592 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
 7593 return createRuntimeFunctionCall(Fn, Args, Name);
 7594}
7595
// Emit a call to __tgt_interop_init for the `interop init` construct.
// Defaults: Device = -1 (current device); when no dependences are given,
// NumDependences = 0 and DependenceAddress = null.
 7598 const LocationDescription &Loc, Value *InteropVar,
 7599 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
 7600 Value *DependenceAddress, bool HaveNowaitClause) {
 7602
 7603 uint32_t SrcLocStrSize;
 7604 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7605 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7606 Value *ThreadId = getOrCreateThreadID(Ident);
 7607 if (Device == nullptr)
 7608 Device = Constant::getAllOnesValue(Int32);
 7609 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
 7610 if (NumDependences == nullptr) {
 7611 NumDependences = ConstantInt::get(Int32, 0);
 7612 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
 7613 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
 7614 }
 7615 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
 7616 Value *Args[] = {
 7617 Ident, ThreadId, InteropVar, InteropTypeVal,
 7618 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
 7619
 7620 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
 7621
 7622 return createRuntimeFunctionCall(Fn, Args);
 7623}
7624
// Emit a call to __tgt_interop_destroy for the `interop destroy` construct.
// Same defaulting as createOMPInteropInit: Device = -1, empty dependence
// list becomes (0, null).
 7626 const LocationDescription &Loc, Value *InteropVar, Value *Device,
 7627 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
 7630
 7631 uint32_t SrcLocStrSize;
 7632 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7633 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7634 Value *ThreadId = getOrCreateThreadID(Ident);
 7635 if (Device == nullptr)
 7636 Device = Constant::getAllOnesValue(Int32);
 7637 if (NumDependences == nullptr) {
 7638 NumDependences = ConstantInt::get(Int32, 0);
 7639 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
 7640 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
 7641 }
 7642 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
 7643 Value *Args[] = {
 7644 Ident, ThreadId, InteropVar, Device,
 7645 NumDependences, DependenceAddress, HaveNowaitClauseVal};
 7646
 7647 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
 7648
 7649 return createRuntimeFunctionCall(Fn, Args);
 7650}
7651
// Emit a call to __tgt_interop_use for the `interop use` construct. Same
// defaulting as the init/destroy variants: Device = -1, empty dependence
// list becomes (0, null).
 7653 Value *InteropVar, Value *Device,
 7654 Value *NumDependences,
 7655 Value *DependenceAddress,
 7656 bool HaveNowaitClause) {
 7659 uint32_t SrcLocStrSize;
 7660 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7661 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7662 Value *ThreadId = getOrCreateThreadID(Ident);
 7663 if (Device == nullptr)
 7664 Device = Constant::getAllOnesValue(Int32);
 7665 if (NumDependences == nullptr) {
 7666 NumDependences = ConstantInt::get(Int32, 0);
 7667 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
 7668 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
 7669 }
 7670 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
 7671 Value *Args[] = {
 7672 Ident, ThreadId, InteropVar, Device,
 7673 NumDependences, DependenceAddress, HaveNowaitClauseVal};
 7674
 7675 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
 7676
 7677 return createRuntimeFunctionCall(Fn, Args);
 7678}
7679
// Emit a call to __kmpc_threadprivate_cached, which returns the per-thread
// copy of a threadprivate variable; the cache itself is an internal global
// named after Name. NOTE(review): signature line and updateToLocation guard
// lost in extraction; verify against upstream.
 7682 llvm::ConstantInt *Size, const llvm::Twine &Name) {
 7685
 7686 uint32_t SrcLocStrSize;
 7687 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7688 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7689 Value *ThreadId = getOrCreateThreadID(Ident);
 7690 Constant *ThreadPrivateCache =
 7691 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
 7692 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
 7693
 7694 Function *Fn =
 7695 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
 7696
 7697 return createRuntimeFunctionCall(Fn, Args);
 7698}
7699
// Emit the device-side kernel prologue: materialize the kernel's dynamic,
// configuration and kernel environment globals, call __kmpc_target_init, and
// split the entry so that only threads for which init returns -1 execute the
// user code; all others return via "worker.exit". Returns the IP at the top
// of "user_code.entry". NOTE(review): the signature line was lost in
// extraction; this is createTargetInit(Loc, Attrs) per the doc comment flow
// — verify against upstream.
 7701 const LocationDescription &Loc,
 7703 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
 7704 "expected num_threads and num_teams to be specified");
 7705
 7706 if (!updateToLocation(Loc))
 7707 return Loc.IP;
 7708
 7709 uint32_t SrcLocStrSize;
 7710 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 7711 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 7712 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
 // The generic state machine is only needed outside SPMD mode.
 7713 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
 7714 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
 7715 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
 7716 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
 7717
 7718 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
 7719 Function *Kernel = DebugKernelWrapper;
 7720
 7721 // We need to strip the debug prefix to get the correct kernel name.
 7722 StringRef KernelName = Kernel->getName();
 7723 const std::string DebugPrefix = "_debug__";
 7724 if (KernelName.ends_with(DebugPrefix)) {
 7725 KernelName = KernelName.drop_back(DebugPrefix.length());
 7726 Kernel = M.getFunction(KernelName);
 7727 assert(Kernel && "Expected the real kernel to exist");
 7728 }
 7729
 7730 // Manifest the launch configuration in the metadata matching the kernel
 7731 // environment.
 7732 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
 7733 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
 7734
 7735 // If MaxThreads not set, select the maximum between the default workgroup
 7736 // size and the MinThreads value.
 7737 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
 7738 if (MaxThreadsVal < 0)
 7739 MaxThreadsVal = std::max(
 7740 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
 7741
 7742 if (MaxThreadsVal > 0)
 7743 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
 7744
 7745 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
 7746 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
 7747 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
 7748 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
 7749 Constant *ReductionDataSize =
 7750 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
 7751 Constant *ReductionBufferLength =
 7752 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
 7753
 // NOTE(review): line 7754 (Fn = getOrCreateRuntimeFunctionPtr(...)) was lost
 // in extraction.
 7755 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
 7756 const DataLayout &DL = Fn->getDataLayout();
 7757
 7758 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
 7759 Constant *DynamicEnvironmentInitializer =
 7760 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
 7761 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
 7762 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
 7763 DynamicEnvironmentInitializer, DynamicEnvironmentName,
 7764 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
 7765 DL.getDefaultGlobalsAddressSpace());
 7766 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
 7767
 // Cast the global to the expected pointer type if the globals address space
 // differs from the environment pointer type's address space.
 7768 Constant *DynamicEnvironment =
 7769 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
 7770 ? DynamicEnvironmentGV
 7771 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
 7772 DynamicEnvironmentPtr);
 7773
 7774 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
 7775 ConfigurationEnvironment, {
 7776 UseGenericStateMachineVal,
 7777 MayUseNestedParallelismVal,
 7778 IsSPMDVal,
 7779 MinThreads,
 7780 MaxThreads,
 7781 MinTeams,
 7782 MaxTeams,
 7783 ReductionDataSize,
 7784 ReductionBufferLength,
 7785 });
 7786 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
 7787 KernelEnvironment, {
 7788 ConfigurationEnvironmentInitializer,
 7789 Ident,
 7790 DynamicEnvironment,
 7791 });
 7792 std::string KernelEnvironmentName =
 7793 (KernelName + "_kernel_environment").str();
 7794 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
 7795 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
 7796 KernelEnvironmentInitializer, KernelEnvironmentName,
 7797 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
 7798 DL.getDefaultGlobalsAddressSpace());
 7799 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
 7800
 7801 Constant *KernelEnvironment =
 7802 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
 7803 ? KernelEnvironmentGV
 7804 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
 7805 KernelEnvironmentPtr);
 // The launch environment is passed as the kernel's first ("implicit")
 // argument; cast it to match __kmpc_target_init's second parameter type.
 7806 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
 7807 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
 7808 KernelLaunchEnvironment =
 7809 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
 7810 ? KernelLaunchEnvironment
 7811 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
 7812 KernelLaunchEnvParamTy);
 7813 CallInst *ThreadKind = createRuntimeFunctionCall(
 7814 Fn, {KernelEnvironment, KernelLaunchEnvironment});
 7815
 7816 Value *ExecUserCode = Builder.CreateICmpEQ(
 7817 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
 7818 "exec_user_code");
 7819
 7820 // ThreadKind = __kmpc_target_init(...)
 7821 // if (ThreadKind == -1)
 7822 // user_code
 7823 // else
 7824 // return;
 7825
 7826 auto *UI = Builder.CreateUnreachable();
 7827 BasicBlock *CheckBB = UI->getParent();
 7828 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
 7829
 7830 BasicBlock *WorkerExitBB = BasicBlock::Create(
 7831 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
 7832 Builder.SetInsertPoint(WorkerExitBB);
 7833 Builder.CreateRetVoid();
 7834
 // Replace the unconditional branch left by the split with the conditional
 // branch on ExecUserCode, then drop both temporaries.
 7835 auto *CheckBBTI = CheckBB->getTerminator();
 7836 Builder.SetInsertPoint(CheckBBTI);
 7837 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
 7838
 7839 CheckBBTI->eraseFromParent();
 7840 UI->eraseFromParent();
 7841
 7842 // Continue in the "user_code" block, see diagram above and in
 7843 // openmp/libomptarget/deviceRTLs/common/include/target.h .
 7844 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
 7845}
7846
// Emit the device-side kernel epilogue: call __kmpc_target_deinit and, when
// teams-reduction sizes are known, patch them into fields {0,7} and {0,8} of
// the kernel's "<name>_kernel_environment" global initializer.
// NOTE(review): the signature line and the runtime-call line were lost in
// extraction; verify against upstream.
 7848 int32_t TeamsReductionDataSize,
 7849 int32_t TeamsReductionBufferLength) {
 7850 if (!updateToLocation(Loc))
 7851 return;
 7852
 7854 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
 7855
 7857
 // Nothing to record if either reduction dimension is zero.
 7858 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
 7859 return;
 7860
 7861 Function *Kernel = Builder.GetInsertBlock()->getParent();
 7862 // We need to strip the debug prefix to get the correct kernel name.
 7863 StringRef KernelName = Kernel->getName();
 7864 const std::string DebugPrefix = "_debug__";
 7865 if (KernelName.ends_with(DebugPrefix))
 7866 KernelName = KernelName.drop_back(DebugPrefix.length());
 7867 auto *KernelEnvironmentGV =
 7868 M.getNamedGlobal((KernelName + "_kernel_environment").str());
 7869 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
 7870 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
 // Fold the two sizes into the constant initializer in place of the zeros
 // written by createTargetInit.
 7871 auto *NewInitializer = ConstantFoldInsertValueInstruction(
 7872 KernelEnvironmentInitializer,
 7873 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
 7874 NewInitializer = ConstantFoldInsertValueInstruction(
 7875 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
 7876 {0, 8});
 7877 KernelEnvironmentGV->setInitializer(NewInitializer);
 7878}
7879
7880static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7881 bool Min) {
7882 if (Kernel.hasFnAttribute(Name)) {
7883 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7884 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7885 }
7886 Kernel.addFnAttr(Name, llvm::utostr(Value));
7887}
7888
7889std::pair<int32_t, int32_t>
7891 int32_t ThreadLimit =
7892 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7893
7894 if (T.isAMDGPU()) {
7895 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7896 if (!Attr.isValid() || !Attr.isStringAttribute())
7897 return {0, ThreadLimit};
7898 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7899 int32_t LB, UB;
7900 if (!llvm::to_integer(UBStr, UB, 10))
7901 return {0, ThreadLimit};
7902 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7903 if (!llvm::to_integer(LBStr, LB, 10))
7904 return {0, UB};
7905 return {LB, UB};
7906 }
7907
7908 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7909 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7910 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7911 }
7912 return {0, ThreadLimit};
7913}
7914
// Record [LB, UB] thread bounds on a kernel: always the generic
// "omp_target_thread_limit" attribute, plus the target-specific attribute
// ("amdgpu-flat-work-group-size" on AMDGPU, "nvvm.maxntid" min-combined on
// NVPTX). NOTE(review): signature line lost in extraction.
 7916 Function &Kernel, int32_t LB,
 7917 int32_t UB) {
 7918 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
 7919
 7920 if (T.isAMDGPU()) {
 7921 Kernel.addFnAttr("amdgpu-flat-work-group-size",
 7922 llvm::utostr(LB) + "," + llvm::utostr(UB));
 7923 return;
 7924 }
 7925
 // Min-combine with any existing nvvm.maxntid so we never loosen a bound.
 7926 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
 7927}
7928
7929std::pair<int32_t, int32_t>
7931 // TODO: Read from backend annotations if available.
7932 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7933}
7934
// Record [LB, UB] team bounds on a kernel via target-specific attributes
// ("nvvm.maxclusterrank" on NVPTX when UB > 0, "amdgpu-max-num-workgroups"
// on AMDGPU) plus the generic "omp_target_num_teams" attribute.
// NOTE(review): signature line lost in extraction.
 7936 int32_t LB, int32_t UB) {
 7937 if (T.isNVPTX())
 7938 if (UB > 0)
 7939 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
 7940 if (T.isAMDGPU())
 7941 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
 7942
 7943 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
 7944}
7945
// Apply device-side attributes to an outlined target-region function
// (non-DSO-local; per-target adjustments for AMDGCN/NVPTX/SPIRV).
// NOTE(review): the lines inside the per-target branches (presumably setting
// linkage/calling conventions) were lost in extraction; verify upstream.
 7946void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
 7947 Function *OutlinedFn) {
 7948 if (Config.isTargetDevice()) {
 7950 // TODO: Determine if DSO local can be set to true.
 7951 OutlinedFn->setDSOLocal(false);
 7953 if (T.isAMDGCN())
 7955 else if (T.isNVPTX())
 7957 else if (T.isSPIRV())
 7959 }
 7960}
7961
7962Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7963 StringRef EntryFnIDName) {
7964 if (Config.isTargetDevice()) {
7965 assert(OutlinedFn && "The outlined function must exist if embedded");
7966 return OutlinedFn;
7967 }
7968
7969 return new GlobalVariable(
7970 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7971 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7972}
7973
7974Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7975 StringRef EntryFnName) {
7976 if (OutlinedFn)
7977 return OutlinedFn;
7978
7979 assert(!M.getGlobalVariable(EntryFnName, true) &&
7980 "Named kernel already exists?");
7981 return new GlobalVariable(
7982 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7983 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7984}
7985
// Generate (via callback) and, when IsOffloadEntry, register the outlined
// function for a target region. On the host with mandatory offload the
// function body is not generated (OutlinedFn = nullptr). NOTE(review): the
// return-type/name line of the signature was lost in extraction.
 7987 TargetRegionEntryInfo &EntryInfo,
 7988 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
 7989 Function *&OutlinedFn, Constant *&OutlinedFnID) {
 7990
 7991 SmallString<64> EntryFnName;
 7992 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
 7993
 7994 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
 7995 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
 7996 if (!CBResult)
 7997 return CBResult.takeError();
 7998 OutlinedFn = *CBResult;
 7999 } else {
 8000 OutlinedFn = nullptr;
 8001 }
 8002
 8003 // If this target outline function is not an offload entry, we don't need to
 8004 // register it. This may be in the case of a false if clause, or if there are
 8005 // no OpenMP targets.
 8006 if (!IsOffloadEntry)
 8007 return Error::success();
 8008
 // The region-id symbol is the function name itself on the device; on the
 // host it is a platform-specific "<name>.region_id" variable.
 8009 std::string EntryFnIDName =
 8010 Config.isTargetDevice()
 8011 ? std::string(EntryFnName)
 8012 : createPlatformSpecificName({EntryFnName, "region_id"});
 8013
 8014 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
 8015 EntryFnName, EntryFnIDName);
 8016 return Error::success();
 8017}
8018
// Register a target-region entry with the offload-info manager: set device
// attributes on the outlined function (if any), create the region-id and
// entry-address symbols, and return the region id. NOTE(review): the
// signature line and the entry-kind argument line were lost in extraction.
 8020 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
 8021 StringRef EntryFnName, StringRef EntryFnIDName) {
 8022 if (OutlinedFn)
 8023 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
 8024 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
 8025 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
 8026 OffloadInfoManager.registerTargetRegionEntryInfo(
 8027 EntryInfo, EntryAddr, OutlinedFnID,
 8029 return OutlinedFnID;
 8030}
8031
// Emit a `target data` (or standalone enter/exit/update) region: build the
// offloading argument arrays, call the begin/end mapper runtime functions
// (guarded by IfCond when present), and — for non-standalone regions — run
// the user body between them, optionally duplicated with/without device
// pointer privatization. NOTE(review): the signature's first line and a few
// interior lines (notably the nowait argument appends and the standalone
// runtime-call line) were lost in extraction; verify against upstream.
 8033 const LocationDescription &Loc, InsertPointTy AllocaIP,
 8034 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
 8035 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
 8036 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
 8038 BodyGenTy BodyGenType)>
 8039 BodyGenCB,
 8040 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
 8041 if (!updateToLocation(Loc))
 8042 return InsertPointTy();
 8043
 8044 Builder.restoreIP(CodeGenIP);
 8045
 // No body callback means this is a standalone construct (enter/exit/update).
 8046 bool IsStandAlone = !BodyGenCB;
 8047 MapInfosTy *MapInfo;
 8048 // Generate the code for the opening of the data environment. Capture all the
 8049 // arguments of the runtime call by reference because they are used in the
 8050 // closing of the region.
 8051 auto BeginThenGen = [&](InsertPointTy AllocaIP,
 8052 InsertPointTy CodeGenIP) -> Error {
 8053 MapInfo = &GenMapInfoCB(Builder.saveIP());
 8054 if (Error Err = emitOffloadingArrays(
 8055 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
 8056 /*IsNonContiguous=*/true, DeviceAddrCB))
 8057 return Err;
 8058
 8059 TargetDataRTArgs RTArgs;
 8061
 8062 // Emit the number of elements in the offloading arrays.
 8063 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
 8064
 8065 // Source location for the ident struct
 8066 if (!SrcLocInfo) {
 8067 uint32_t SrcLocStrSize;
 8068 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 8069 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 8070 }
 8071
 8072 SmallVector<llvm::Value *, 13> OffloadingArgs = {
 8073 SrcLocInfo, DeviceID,
 8074 PointerNum, RTArgs.BasePointersArray,
 8075 RTArgs.PointersArray, RTArgs.SizesArray,
 8076 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
 8077 RTArgs.MappersArray};
 8078
 8079 if (IsStandAlone) {
 8080 assert(MapperFunc && "MapperFunc missing for standalone target data");
 8081
 // The standalone runtime call is emitted inside a task body so that a
 // `nowait` clause can wrap it in an outer target task.
 8082 auto TaskBodyCB = [&](Value *, Value *,
 8084 if (Info.HasNoWait) {
 8085 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
 8089 }
 8090
 8092 OffloadingArgs);
 8093
 8094 if (Info.HasNoWait) {
 8095 BasicBlock *OffloadContBlock =
 8096 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
 8097 Function *CurFn = Builder.GetInsertBlock()->getParent();
 8098 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
 8099 Builder.restoreIP(Builder.saveIP());
 8100 }
 8101 return Error::success();
 8102 };
 8103
 8104 bool RequiresOuterTargetTask = Info.HasNoWait;
 8105 if (!RequiresOuterTargetTask)
 8106 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
 8107 /*TargetTaskAllocaIP=*/{}));
 8108 else
 8109 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
 8110 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
 8111 } else {
 8112 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
 8113 omp::OMPRTL___tgt_target_data_begin_mapper);
 8114
 8115 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
 8116
 // Store returned device pointers into their use_device_ptr allocas.
 8117 for (auto DeviceMap : Info.DevicePtrInfoMap) {
 8118 if (isa<AllocaInst>(DeviceMap.second.second)) {
 8119 auto *LI =
 8120 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
 8121 Builder.CreateStore(LI, DeviceMap.second.second);
 8122 }
 8123 }
 8124
 8125 // If device pointer privatization is required, emit the body of the
 8126 // region here. It will have to be duplicated: with and without
 8127 // privatization.
 8128 InsertPointOrErrorTy AfterIP =
 8129 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
 8130 if (!AfterIP)
 8131 return AfterIP.takeError();
 8132 Builder.restoreIP(*AfterIP);
 8133 }
 8134 return Error::success();
 8135 };
 8136
 8137 // If we need device pointer privatization, we need to emit the body of the
 8138 // region with no privatization in the 'else' branch of the conditional.
 8139 // Otherwise, we don't have to do anything.
 8140 auto BeginElseGen = [&](InsertPointTy AllocaIP,
 8141 InsertPointTy CodeGenIP) -> Error {
 8142 InsertPointOrErrorTy AfterIP =
 8143 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
 8144 if (!AfterIP)
 8145 return AfterIP.takeError();
 8146 Builder.restoreIP(*AfterIP);
 8147 return Error::success();
 8148 };
 8149
 8150 // Generate code for the closing of the data region.
 8151 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
 8152 TargetDataRTArgs RTArgs;
 8153 Info.EmitDebug = !MapInfo->Names.empty();
 // ForEndCall=true re-emits the argument arrays suitable for the end call.
 8154 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
 8155
 8156 // Emit the number of elements in the offloading arrays.
 8157 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
 8158
 8159 // Source location for the ident struct
 8160 if (!SrcLocInfo) {
 8161 uint32_t SrcLocStrSize;
 8162 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 8163 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 8164 }
 8165
 8166 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
 8167 PointerNum, RTArgs.BasePointersArray,
 8168 RTArgs.PointersArray, RTArgs.SizesArray,
 8169 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
 8170 RTArgs.MappersArray};
 8171 Function *EndMapperFunc =
 8172 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
 8173
 8174 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
 8175 return Error::success();
 8176 };
 8177
 8178 // We don't have to do anything to close the region if the if clause evaluates
 8179 // to false.
 8180 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
 8181 return Error::success();
 8182 };
 8183
 // Dispatch: regions with a body emit begin + body + end (each side guarded
 // by IfCond); standalone constructs emit just the begin/then path.
 8184 Error Err = [&]() -> Error {
 8185 if (BodyGenCB) {
 8186 Error Err = [&]() {
 8187 if (IfCond)
 8188 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
 8189 return BeginThenGen(AllocaIP, Builder.saveIP());
 8190 }();
 8191
 8192 if (Err)
 8193 return Err;
 8194
 8195 // If we don't require privatization of device pointers, we emit the body
 8196 // in between the runtime calls. This avoids duplicating the body code.
 8197 InsertPointOrErrorTy AfterIP =
 8198 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
 8199 if (!AfterIP)
 8200 return AfterIP.takeError();
 8201 restoreIPandDebugLoc(Builder, *AfterIP);
 8202
 8203 if (IfCond)
 8204 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
 8205 return EndThenGen(AllocaIP, Builder.saveIP());
 8206 }
 8207 if (IfCond)
 8208 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
 8209 return BeginThenGen(AllocaIP, Builder.saveIP());
 8210 }();
 8211
 8212 if (Err)
 8213 return Err;
 8214
 8215 return Builder.saveIP();
 8216}
8217
// Select the __kmpc_{for,distribute}_static_init_{4,4u,8,8u} runtime function
// matching the induction-variable size/signedness and whether this is a GPU
// distribute loop. NOTE(review): the signature lines were lost in extraction.
 8220 bool IsGPUDistribute) {
 8221 assert((IVSize == 32 || IVSize == 64) &&
 8222 "IV size is not compatible with the omp runtime");
 8223 RuntimeFunction Name;
 8224 if (IsGPUDistribute)
 8225 Name = IVSize == 32
 8226 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
 8227 : omp::OMPRTL___kmpc_distribute_static_init_4u)
 8228 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
 8229 : omp::OMPRTL___kmpc_distribute_static_init_8u);
 8230 else
 8231 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
 8232 : omp::OMPRTL___kmpc_for_static_init_4u)
 8233 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
 8234 : omp::OMPRTL___kmpc_for_static_init_8u);
 8235
 8236 return getOrCreateRuntimeFunction(M, Name);
 8237}
8238
// Select __kmpc_dispatch_init_{4,4u,8,8u} by induction-variable
// size/signedness. NOTE(review): signature line lost in extraction.
 8240 bool IVSigned) {
 8241 assert((IVSize == 32 || IVSize == 64) &&
 8242 "IV size is not compatible with the omp runtime");
 8243 RuntimeFunction Name = IVSize == 32
 8244 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
 8245 : omp::OMPRTL___kmpc_dispatch_init_4u)
 8246 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
 8247 : omp::OMPRTL___kmpc_dispatch_init_8u);
 8248
 8249 return getOrCreateRuntimeFunction(M, Name);
 8250}
8251
// Select __kmpc_dispatch_next_{4,4u,8,8u} by induction-variable
// size/signedness. NOTE(review): signature line lost in extraction.
 8253 bool IVSigned) {
 8254 assert((IVSize == 32 || IVSize == 64) &&
 8255 "IV size is not compatible with the omp runtime");
 8256 RuntimeFunction Name = IVSize == 32
 8257 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
 8258 : omp::OMPRTL___kmpc_dispatch_next_4u)
 8259 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
 8260 : omp::OMPRTL___kmpc_dispatch_next_8u);
 8261
 8262 return getOrCreateRuntimeFunction(M, Name);
 8263}
8264
// Select __kmpc_dispatch_fini_{4,4u,8,8u} by induction-variable
// size/signedness. NOTE(review): signature line lost in extraction.
 8266 bool IVSigned) {
 8267 assert((IVSize == 32 || IVSize == 64) &&
 8268 "IV size is not compatible with the omp runtime");
 8269 RuntimeFunction Name = IVSize == 32
 8270 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
 8271 : omp::OMPRTL___kmpc_dispatch_fini_4u)
 8272 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
 8273 : omp::OMPRTL___kmpc_dispatch_fini_8u);
 8274
 8275 return getOrCreateRuntimeFunction(M, Name);
 8276}
8277
// Return the __kmpc_dispatch_deinit runtime function (size/signedness
// independent). NOTE(review): signature line lost in extraction.
 8279 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
 8280}
8281
// Repair debug info after outlining: retarget debug-record location operands
// through ValueReplacementMap, re-create DILocalVariables with the argument
// numbers of the outlined function, and on the device synthesize a parameter
// variable for the implicit "dyn_ptr" first argument. NOTE(review): the
// function-name line of the signature and a few interior lines were lost in
// extraction.
 8283 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
 8284 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
 8285
 // No subprogram means no debug info to fix up.
 8286 DISubprogram *NewSP = Func->getSubprogram();
 8287 if (!NewSP)
 8288 return;
 8289
 8291
 8292 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
 8293 DILocalVariable *&NewVar = RemappedVariables[OldVar];
 8294 // Only use cached variable if the arg number matches. This is important
 8295 // so that DIVariable created for privatized variables are not discarded.
 8296 if (NewVar && (arg == NewVar->getArg()))
 8297 return NewVar;
 8298
 8300 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
 8301 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
 8302 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
 8303 return NewVar;
 8304 };
 8305
 8306 auto UpdateDebugRecord = [&](auto *DR) {
 8307 DILocalVariable *OldVar = DR->getVariable();
 8308 unsigned ArgNo = 0;
 // Rewrite each location operand that was replaced during outlining and
 // remember the (1-based) argument number the value now corresponds to.
 8309 for (auto Loc : DR->location_ops()) {
 8310 auto Iter = ValueReplacementMap.find(Loc);
 8311 if (Iter != ValueReplacementMap.end()) {
 8312 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
 8313 ArgNo = std::get<1>(Iter->second) + 1;
 8314 }
 8315 }
 8316 if (ArgNo != 0)
 8317 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
 8318 };
 8319
 8320 // The location and scope of variable intrinsics and records still point to
 8321 // the parent function of the target region. Update them.
 8322 for (Instruction &I : instructions(Func)) {
 8324 "Unexpected debug intrinsic");
 8325 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
 8326 UpdateDebugRecord(&DVR);
 8327 }
 8328 // An extra argument is passed to the device. Create the debug data for it.
 8329 if (OMPBuilder.Config.isTargetDevice()) {
 8330 DICompileUnit *CU = NewSP->getUnit();
 8331 Module *M = Func->getParent();
 8332 DIBuilder DB(*M, true, CU);
 8333 DIType *VoidPtrTy =
 8334 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
 8335 DILocalVariable *Var = DB.createParameterVariable(
 8336 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
 8337 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
 8338 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
 8339 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
 8340 &(*Func->begin()));
 8341 }
 8342}
8343
// Look through a single address-space cast — whether it is an
// AddrSpaceCastInst or an addrspacecast ConstantExpr (both are covered by
// llvm::Operator) — and return the underlying pointer operand. Any other
// value is returned unchanged.
8345 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8346 return cast<Operator>(V)->getOperand(0);
8347 return V;
8348}
8349
// Build the outlined function for a target region named FuncName:
//  * derive the parameter list from Inputs (device: a leading implicit ptr
//    argument, then ptr or i64 per input; host: the input types verbatim),
//  * create an internal-linkage void function and forward the parent's
//    target-cpu/target-features attributes to it,
//  * on the device path, record the kernel execution mode and emit the
//    target init/deinit calls around the user code,
//  * run the body-generation and argument-accessor callbacks, then rewrite
//    every in-function use of each input value to its argument copy and fix
//    up the debug info for the new function.
// Returns the function, or the first error produced by a callback.
8351 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8353 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8356 SmallVector<Type *> ParameterTypes;
8357 if (OMPBuilder.Config.isTargetDevice()) {
8358 // Add the "implicit" runtime argument we use to provide launch specific
8359 // information for target devices.
8360 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
8361 ParameterTypes.push_back(Int8PtrTy);
8362
8363 // All parameters to target devices are passed as pointers
8364 // or i64. This assumes 64-bit address spaces/pointers.
8365 for (auto &Arg : Inputs)
8366 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8367 ? Arg->getType()
8368 : Type::getInt64Ty(Builder.getContext()));
8369 } else {
8370 for (auto &Arg : Inputs)
8371 ParameterTypes.push_back(Arg->getType());
8372 }
8373
8374 auto BB = Builder.GetInsertBlock();
8375 auto M = BB->getModule();
8376 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8377 /*isVarArg*/ false);
8378 auto Func =
8379 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8380
8381 // Forward target-cpu and target-features function attributes from the
8382 // original function to the new outlined function.
8383 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8384
8385 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8386 if (TargetCpuAttr.isStringAttribute())
8387 Func->addFnAttr(TargetCpuAttr);
8388
8389 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8390 if (TargetFeaturesAttr.isStringAttribute())
8391 Func->addFnAttr(TargetFeaturesAttr);
8392
8393 if (OMPBuilder.Config.isTargetDevice()) {
8394 Value *ExecMode =
8395 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8396 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8397 }
8398
8399 // Save insert point.
8400 IRBuilder<>::InsertPointGuard IPG(Builder);
8401 // We will generate the entries in the outlined function but the debug
8402 // location may still be pointing to the parent function. Reset it now.
8403 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8404
8405 // Generate the region into the function.
8406 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8407 Builder.SetInsertPoint(EntryBB);
8408
8409 // Insert target init call in the device compilation pass.
8410 if (OMPBuilder.Config.isTargetDevice())
8411 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8412
8413 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8414
8415 // As we embed the user code in the middle of our target region after we
8416 // generate entry code, we must move what allocas we can into the entry
8417 // block to avoid possible breaking optimisations for device
8418 if (OMPBuilder.Config.isTargetDevice())
8420
8421 // Insert target deinit call in the device compilation pass.
8422 BasicBlock *OutlinedBodyBB =
8423 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8425 Builder.saveIP(),
8426 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8427 if (!AfterIP)
8428 return AfterIP.takeError();
8429 Builder.restoreIP(*AfterIP);
8430 if (OMPBuilder.Config.isTargetDevice())
8431 OMPBuilder.createTargetDeinit(Builder);
8432
8433 // Insert return instruction.
8434 Builder.CreateRetVoid();
8435
8436 // New Alloca IP at entry point of created device function.
8437 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8438 auto AllocaIP = Builder.saveIP();
8439
8440 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8441
8442 // Skip the artificial dyn_ptr on the device.
8443 const auto &ArgRange =
8444 OMPBuilder.Config.isTargetDevice()
8445 ? make_range(Func->arg_begin() + 1, Func->arg_end())
8446 : Func->args();
8447
8449
8450 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8451 // Things like GEP's can come in the form of Constants. Constants and
8452 // ConstantExpr's do not have access to the knowledge of what they're
8453 // contained in, so we must dig a little to find an instruction so we
8454 // can tell if they're used inside of the function we're outlining. We
8455 // also replace the original constant expression with a new instruction
8456 // equivalent; an instruction as it allows easy modification in the
8457 // following loop, as we can now know the constant (instruction) is
8458 // owned by our target function and replaceUsesOfWith can now be invoked
8459 // on it (cannot do this with constants it seems). A brand new one also
8460 // allows us to be cautious as it is perhaps possible the old expression
8461 // was used inside of the function but exists and is used externally
8462 // (unlikely by the nature of a Constant, but still).
8463 // NOTE: We cannot remove dead constants that have been rewritten to
8464 // instructions at this stage, we run the risk of breaking later lowering
8465 // by doing so as we could still be in the process of lowering the module
8466 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8467 // constants we have created rewritten versions of.
8468 if (auto *Const = dyn_cast<Constant>(Input))
8469 convertUsersOfConstantsToInstructions(Const, Func, false);
8470
8471 // Collect users before iterating over them to avoid invalidating the
8472 // iteration in case a user uses Input more than once (e.g. a call
8473 // instruction).
8474 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8475 // Collect all the instructions
8477 if (auto *Instr = dyn_cast<Instruction>(User))
8478 if (Instr->getFunction() == Func)
8479 Instr->replaceUsesOfWith(Input, InputCopy);
8480 };
8481
8482 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8483
8484 // Rewrite uses of input values to parameters.
8485 for (auto InArg : zip(Inputs, ArgRange)) {
8486 Value *Input = std::get<0>(InArg);
8487 Argument &Arg = std::get<1>(InArg);
8488 Value *InputCopy = nullptr;
8489
8491 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8492 if (!AfterIP)
8493 return AfterIP.takeError();
8494 Builder.restoreIP(*AfterIP);
// Remember argument index and copy so the debug-info fixup can retarget
// the debug records that referred to the original input value.
8495 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8496
8497 // In certain cases a Global may be set up for replacement, however, this
8498 // Global may be used in multiple arguments to the kernel, just segmented
8499 // apart, for example, if we have a global array, that is sectioned into
8500 // multiple mappings (technically not legal in OpenMP, but there is a case
8501 // in Fortran for Common Blocks where this is necessary), we will end up
8502 // with GEP's into this array inside the kernel, that refer to the Global
8503 // but are technically separate arguments to the kernel for all intents and
8504 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8505 // index, it will fold into a reference to the Global, if we then encounter
8506 // this folded GEP during replacement all of the references to the
8507 // Global in the kernel will be replaced with the argument we have generated
8508 // that corresponds to it, including any other GEP's that refer to the
8509 // Global that may be other arguments. This will invalidate all of the other
8510 // preceding mapped arguments that refer to the same global that may be
8511 // separate segments. To prevent this, we defer global processing until all
8512 // other processing has been performed.
8515 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8516 continue;
8517 }
8518
8520 continue;
8521
8522 ReplaceValue(Input, InputCopy, Func);
8523 }
8524
8525 // Replace all of our deferred Input values, currently just Globals.
8526 for (auto Deferred : DeferredReplacement)
8527 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8528
// Retarget debug intrinsics/records from the parent function onto the new
// outlined function using the input -> (copy, arg-index) map built above.
8529 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8530 ValueReplacementMap);
8531 return Func;
8532}
8533/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8534/// of pointers containing shared data between the parent task and the created
8535/// task.
// (See the \file-level doc comment directly above.) Returns the loaded
// pointer to the shareds block of the task descriptor, looking through the
// enclosing task_with_privates wrapper struct when one is in use.
8537 IRBuilderBase &Builder,
8538 Value *TaskWithPrivates,
8539 Type *TaskWithPrivatesTy) {
8540
8541 Type *TaskTy = OMPIRBuilder.Task;
8542 LLVMContext &Ctx = Builder.getContext();
// GEP to member 0: either the task descriptor inside the wrapper struct, or
// the first member of the task descriptor itself when there is no wrapper.
8543 Value *TaskT =
8544 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8545 Value *Shareds = TaskT;
8546 // TaskWithPrivatesTy can be one of the following
8547 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8548 // %struct.privates }
8549 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8550 //
8551 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8552 // its first member has to be the task descriptor. TaskTy is the type of the
8553 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8554 // first member of TaskT, gives us the pointer to shared data.
8555 if (TaskWithPrivatesTy != TaskTy)
8556 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8557 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8558}
8559/// Create an entry point for a target task. It'll have the following
8560/// signature:
8561/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8562/// This function is called from emitTargetTask once the
8563/// code to launch the target kernel has been outlined already.
8564/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8565/// into the task structure so that the deferred target task can access this
8566/// data even after the stack frame of the generating task has been rolled
8567/// back. Offloading arrays contain base pointers, pointers, sizes etc
8568/// of the data that the target kernel will access. These in effect are the
8569/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// (Documented by the \file-level comment directly above.) StaleCI is the
// placeholder call to the outlined kernel-launch function; its callee and
// operands are used to rebuild the launch-argument list inside the proxy.
// SharedArgsOperandNo is 0 when there is no shareds struct; otherwise it is
// the operand index of the shareds pointer on StaleCI.
8571 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8572 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8573 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8574
8575 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8576 // This is because PrivatesTy is the type of the structure in which
8577 // we pass the offloading arrays to the deferred target task.
8578 assert((!NumOffloadingArrays || PrivatesTy) &&
8579 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8580 "to privatize");
8581
8582 Module &M = OMPBuilder.M;
8583 // KernelLaunchFunction is the target launch function, i.e.
8584 // the function that sets up kernel arguments and calls
8585 // __tgt_target_kernel to launch the kernel on the device.
8586 //
8587 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8588
8589 // StaleCI is the CallInst which is the call to the outlined
8590 // target kernel launch function. If there are local live-in values
8591 // that the outlined function uses then these are aggregated into a structure
8592 // which is passed as the second argument. If there are no local live-in
8593 // values or if all values used by the outlined kernel are global variables,
8594 // then there's only one argument, the threadID. So, StaleCI can be
8595 //
8596 // %structArg = alloca { ptr, ptr }, align 8
8597 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8598 // store ptr %20, ptr %gep_, align 8
8599 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8600 // store ptr %21, ptr %gep_8, align 8
8601 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8602 //
8603 // OR
8604 //
8605 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8607 StaleCI->getIterator());
8608
8609 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8610
// The proxy must match the task entry-point signature expected by the
// OpenMP runtime: void(i32 thread-id, ptr task-descriptor).
8611 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8612 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8613 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8614
8615 auto ProxyFnTy =
8616 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8617 /* isVarArg */ false);
8618 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8619 ".omp_target_task_proxy_func",
8620 Builder.GetInsertBlock()->getModule());
8621 Value *ThreadId = ProxyFn->getArg(0);
8622 Value *TaskWithPrivates = ProxyFn->getArg(1);
8623 ThreadId->setName("thread.id");
8624 TaskWithPrivates->setName("task");
8625
8626 bool HasShareds = SharedArgsOperandNo > 0;
8627 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8628 BasicBlock *EntryBB =
8629 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8630 Builder.SetInsertPoint(EntryBB);
8631
// Rebuild the launch-argument list: thread id first, then a GEP to each
// privatized offloading array, then (if present) a fresh copy of the
// shareds struct.
8632 SmallVector<Value *> KernelLaunchArgs;
8633 KernelLaunchArgs.reserve(StaleCI->arg_size());
8634 KernelLaunchArgs.push_back(ThreadId);
8635
8636 if (HasOffloadingArrays) {
8637 assert(TaskTy != TaskWithPrivatesTy &&
8638 "If there are offloading arrays to pass to the target"
8639 "TaskTy cannot be the same as TaskWithPrivatesTy");
8640 (void)TaskTy;
8641 Value *Privates =
8642 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8643 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8644 KernelLaunchArgs.push_back(
8645 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8646 }
8647
8648 if (HasShareds) {
8649 auto *ArgStructAlloca =
8650 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8651 assert(ArgStructAlloca &&
8652 "Unable to find the alloca instruction corresponding to arguments "
8653 "for extracted function");
8654 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8655 std::optional<TypeSize> ArgAllocSize =
8656 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8657 assert(ArgStructType && ArgAllocSize &&
8658 "Unable to determine size of arguments for extracted function");
8659 uint64_t StructSize = ArgAllocSize->getFixedValue();
8660
// Copy the shareds block out of the task descriptor into a local struct so
// the launch function sees the same layout the outliner created.
8661 AllocaInst *NewArgStructAlloca =
8662 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8663
8664 Value *SharedsSize = Builder.getInt64(StructSize);
8665
8667 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8668
8669 Builder.CreateMemCpy(
8670 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8671 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8672 KernelLaunchArgs.push_back(NewArgStructAlloca);
8673 }
8674 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8675 Builder.CreateRetVoid();
8676 return ProxyFn;
8677}
8679
// Return the array type behind a pointer to an offloading array: the source
// element type of a GEP, or the allocated type of an alloca. Any other
// producer is a programming error (the trailing return only placates
// compilers that do not understand llvm_unreachable).
8680 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8681 return GEP->getSourceElementType();
8682 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8683 return Alloca->getAllocatedType();
8684
8685 llvm_unreachable("Unhandled Instruction type");
8686 return nullptr;
8687}
8688// This function returns a struct that has at most two members.
8689// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8690// descriptor. The second member, if needed, is a struct containing arrays
8691// that need to be passed to the offloaded target kernel. For example,
8692// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8693// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8694// respectively, then the types created by this function are
8695//
8696// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8697// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8698// %struct.privates }
8699// %struct.task_with_privates is returned by this function.
8700// If there aren't any offloading arrays to pass to the target kernel,
8701// %struct.kmp_task_ompbuilder_t is returned.
8702 static StructType *
8704 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8705
// Nothing to privatize: the plain task descriptor type suffices.
8706 if (OffloadingArraysToPrivatize.empty())
8707 return OMPIRBuilder.Task;
8708
// One struct field per offloading array, in the order given; the field type
// is the pointee array type recovered from the defining GEP/alloca.
8709 SmallVector<Type *, 4> StructFieldTypes;
8710 for (Value *V : OffloadingArraysToPrivatize) {
8711 assert(V->getType()->isPointerTy() &&
8712 "Expected pointer to array to privatize. Got a non-pointer value "
8713 "instead");
8714 Type *ArrayTy = getOffloadingArrayType(V);
8715 assert(ArrayTy && "ArrayType cannot be nullptr");
8716 StructFieldTypes.push_back(ArrayTy);
8717 }
8718 StructType *PrivatesStructTy =
8719 StructType::create(StructFieldTypes, "struct.privates");
8720 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8721 "struct.task_with_privates");
8722}
// Thin wrapper: packages createOutlinedFunction as a FunctionGenCallback and
// hands it to emitTargetRegionFunction, which takes care of offload-entry
// bookkeeping and returns the function and its ID via the out-parameters
// OutlinedFn / OutlinedFnID.
8724 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8725 TargetRegionEntryInfo &EntryInfo,
8727 Function *&OutlinedFn, Constant *&OutlinedFnID,
8731
8732 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8733 [&](StringRef EntryFnName) {
8734 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8735 EntryFnName, Inputs, CBFunc,
8736 ArgAccessorFuncCB);
8737 };
8738
8739 return OMPBuilder.emitTargetRegionFunction(
8740 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8741 OutlinedFnID);
8742}
8743
8745 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8748 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8749
8750 // The following explains the code-gen scenario for the `target` directive. A
8751 // similar scenario is followed for other device-related directives (e.g.
8752 // `target enter data`) but in similar fashion since we only need to emit task
8753 // that encapsulates the proper runtime call.
8754 //
8755 // When we arrive at this function, the target region itself has been
8756 // outlined into the function OutlinedFn.
8757 // So at this point, for
8758 // --------------------------------------------------------------
8759 // void user_code_that_offloads(...) {
8760 // omp target depend(..) map(from:a) map(to:b) private(i)
8761 // do i = 1, 10
8762 // a(i) = b(i) + n
8763 // }
8764 //
8765 // --------------------------------------------------------------
8766 //
8767 // we have
8768 //
8769 // --------------------------------------------------------------
8770 //
8771 // void user_code_that_offloads(...) {
8772 // %.offload_baseptrs = alloca [2 x ptr], align 8
8773 // %.offload_ptrs = alloca [2 x ptr], align 8
8774 // %.offload_mappers = alloca [2 x ptr], align 8
8775 // ;; target region has been outlined and now we need to
8776 // ;; offload to it via a target task.
8777 // }
8778 // void outlined_device_function(ptr a, ptr b, ptr n) {
8779 // n = *n_ptr;
8780 // do i = 1, 10
8781 // a(i) = b(i) + n
8782 // }
8783 //
8784 // We have to now do the following
8785 // (i) Make an offloading call to outlined_device_function using the OpenMP
8786 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8787 // emitted by emitKernelLaunch
8788 // (ii) Create a task entry point function that calls kernel_launch_function
8789 // and is the entry point for the target task. See
8790 // '@.omp_target_task_proxy_func in the pseudocode below.
8791 // (iii) Create a task with the task entry point created in (ii)
8792 //
8793 // That is we create the following
8794 // struct task_with_privates {
8795 // struct kmp_task_ompbuilder_t task_struct;
8796 // struct privates {
8797 // [2 x ptr] ; baseptrs
8798 // [2 x ptr] ; ptrs
8799 // [2 x i64] ; sizes
8800 // }
8801 // }
8802 // void user_code_that_offloads(...) {
8803 // %.offload_baseptrs = alloca [2 x ptr], align 8
8804 // %.offload_ptrs = alloca [2 x ptr], align 8
8805 // %.offload_sizes = alloca [2 x i64], align 8
8806 //
8807 // %structArg = alloca { ptr, ptr, ptr }, align 8
8808 // %strucArg[0] = a
8809 // %strucArg[1] = b
8810 // %strucArg[2] = &n
8811 //
8812 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8813 // sizeof(kmp_task_ompbuilder_t),
8814 // sizeof(structArg),
8815 // @.omp_target_task_proxy_func,
8816 // ...)
8817 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8818 // sizeof(structArg))
8819 // memcpy(target_task_with_privates->privates->baseptrs,
8820 // offload_baseptrs, sizeof(offload_baseptrs)
8821 // memcpy(target_task_with_privates->privates->ptrs,
8822 // offload_ptrs, sizeof(offload_ptrs)
8823 // memcpy(target_task_with_privates->privates->sizes,
8824 // offload_sizes, sizeof(offload_sizes)
8825 // dependencies_array = ...
8826 // ;; if nowait not present
8827 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8828 // call @__kmpc_omp_task_begin_if0(...)
8829 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8830 // %target_task_with_privates)
8831 // call @__kmpc_omp_task_complete_if0(...)
8832 // }
8833 //
8834 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8835 // ptr %task) {
8836 // %structArg = alloca {ptr, ptr, ptr}
8837 // %task_ptr = getelementptr(%task, 0, 0)
8838 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8839 // memcpy(%structArg, %shared_data, sizeof(%structArg))
8840 //
8841 // %offloading_arrays = getelementptr(%task, 0, 1)
8842 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8843 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8844 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8845 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8846 // %offload_sizes, %structArg)
8847 // }
8848 //
8849 // We need the proxy function because the signature of the task entry point
8850 // expected by kmpc_omp_task is always the same and will be different from
8851 // that of the kernel_launch function.
8852 //
8853 // kernel_launch_function is generated by emitKernelLaunch and has the
8854 // always_inline attribute. For this example, it'll look like so:
8855 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8856 // %offload_sizes, %structArg) alwaysinline {
8857 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8858 // ; load aggregated data from %structArg
8859 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8860 // ; offload_sizes
8861 // call i32 @__tgt_target_kernel(...,
8862 // outlined_device_function,
8863 // ptr %kernel_args)
8864 // }
8865 // void outlined_device_function(ptr a, ptr b, ptr n) {
8866 // n = *n_ptr;
8867 // do i = 1, 10
8868 // a(i) = b(i) + n
8869 // }
8870 //
// Carve out dedicated alloca and body blocks for the region that will be
// outlined into the target task.
8871 BasicBlock *TargetTaskBodyBB =
8872 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8873 BasicBlock *TargetTaskAllocaBB =
8874 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8875
8876 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8877 TargetTaskAllocaBB->begin());
8878 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8879
8880 OutlineInfo OI;
8881 OI.EntryBB = TargetTaskAllocaBB;
8882 OI.OuterAllocaBB = AllocaIP.getBlock();
8883
8884 // Add the thread ID argument.
8887 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8888
8889 // Generate the task body which will subsequently be outlined.
8890 Builder.restoreIP(TargetTaskBodyIP);
8891 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8892 return Err;
8893
8894 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8895 // it is given. These blocks are enumerated by
8896 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8897 // to be outside the region. In other words, OI.ExitBlock is expected to be
8898 // the start of the region after the outlining. We used to set OI.ExitBlock
8899 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8900 // except when the task body is a single basic block. In that case,
8901 // OI.ExitBlock is set to the single task body block and will get left out of
8902 // the outlining process. So, simply create a new empty block to which we
8903 // unconditionally branch from where TaskBodyCB left off
8904 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8905 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8906 /*IsFinished=*/true);
8907
// A deferred (nowait + device) task may run after the generating frame is
// gone, so the offloading arrays must be copied into the task's privates.
8908 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8909 bool NeedsTargetTask = HasNoWait && DeviceID;
8910 if (NeedsTargetTask) {
8911 for (auto *V :
8912 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8913 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8914 RTArgs.SizesArray}) {
8916 OffloadingArraysToPrivatize.push_back(V);
8918 }
8919 }
8920 }
// After outlining, replace the placeholder call (StaleCI) with the real
// task-allocation/dispatch sequence.
8921 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8922 DeviceID, OffloadingArraysToPrivatize](
8923 Function &OutlinedFn) mutable {
8924 assert(OutlinedFn.hasOneUse() &&
8925 "there must be a single user for the outlined function");
8926
8927 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8928
8929 // The first argument of StaleCI is always the thread id.
8930 // The next few arguments are the pointers to offloading arrays
8931 // if any. (see OffloadingArraysToPrivatize)
8932 // Finally, all other local values that are live-in into the outlined region
8933 // end up in a structure whose pointer is passed as the last argument. This
8934 // piece of data is passed in the "shared" field of the task structure. So,
8935 // we know we have to pass shareds to the task if the number of arguments is
8936 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8937 // thread id. Further, for safety, we assert that the number of arguments of
8938 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8939 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8940 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8941 assert((!HasShareds ||
8942 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8943 "Wrong number of arguments for StaleCI when shareds are present");
8944 int SharedArgOperandNo =
8945 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8946
8947 StructType *TaskWithPrivatesTy =
8948 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8949 StructType *PrivatesTy = nullptr;
8950
8951 if (!OffloadingArraysToPrivatize.empty())
8952 PrivatesTy =
8953 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8954
8956 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8957 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8958
8959 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8960 << "\n");
8961
8962 Builder.SetInsertPoint(StaleCI);
8963
8964 // Gather the arguments for emitting the runtime call.
8965 uint32_t SrcLocStrSize;
8966 Constant *SrcLocStr =
8968 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8969
8970 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8971 //
8972 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8973 // the DeviceID to the deferred task and also since
8974 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8975 Function *TaskAllocFn =
8976 !NeedsTargetTask
8977 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8979 OMPRTL___kmpc_omp_target_task_alloc);
8980
8981 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8982 // call.
8983 Value *ThreadID = getOrCreateThreadID(Ident);
8984
8985 // Argument - `sizeof_kmp_task_t` (TaskSize)
8986 // Tasksize refers to the size in bytes of kmp_task_t data structure
8987 // plus any other data to be passed to the target task, if any, which
8988 // is packed into a struct. kmp_task_t and the struct so created are
8989 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8990 Value *TaskSize = Builder.getInt64(
8991 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8992
8993 // Argument - `sizeof_shareds` (SharedsSize)
8994 // SharedsSize refers to the shareds array size in the kmp_task_t data
8995 // structure.
8996 Value *SharedsSize = Builder.getInt64(0);
8997 if (HasShareds) {
8998 auto *ArgStructAlloca =
8999 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9000 assert(ArgStructAlloca &&
9001 "Unable to find the alloca instruction corresponding to arguments "
9002 "for extracted function");
9003 std::optional<TypeSize> ArgAllocSize =
9004 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9005 assert(ArgAllocSize &&
9006 "Unable to determine size of arguments for extracted function");
9007 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9008 }
9009
9010 // Argument - `flags`
9011 // Task is tied iff (Flags & 1) == 1.
9012 // Task is untied iff (Flags & 1) == 0.
9013 // Task is final iff (Flags & 2) == 2.
9014 // Task is not final iff (Flags & 2) == 0.
9015 // A target task is not final and is untied.
9016 Value *Flags = Builder.getInt32(0);
9017
9018 // Emit the @__kmpc_omp_task_alloc runtime call
9019 // The runtime call returns a pointer to an area where the task captured
9020 // variables must be copied before the task is run (TaskData)
9021 CallInst *TaskData = nullptr;
9022
9023 SmallVector<llvm::Value *> TaskAllocArgs = {
9024 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9025 /*flags=*/Flags,
9026 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9027 /*task_func=*/ProxyFn};
9028
9029 if (NeedsTargetTask) {
9030 assert(DeviceID && "Expected non-empty device ID.");
9031 TaskAllocArgs.push_back(DeviceID);
9032 }
9033
9034 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9035
// Copy shareds and any privatized offloading arrays into the freshly
// allocated task so the deferred task owns its own copies.
9036 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9037 if (HasShareds) {
9038 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9040 *this, Builder, TaskData, TaskWithPrivatesTy);
9041 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9042 SharedsSize);
9043 }
9044 if (!OffloadingArraysToPrivatize.empty()) {
9045 Value *Privates =
9046 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9047 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9048 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9049 [[maybe_unused]] Type *ArrayType =
9050 getOffloadingArrayType(PtrToPrivatize);
9051 assert(ArrayType && "ArrayType cannot be nullptr");
9052
9053 Type *ElementType = PrivatesTy->getElementType(i);
9054 assert(ElementType == ArrayType &&
9055 "ElementType should match ArrayType");
9056 (void)ArrayType;
9057
9058 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9059 Builder.CreateMemCpy(
9060 Dst, Alignment, PtrToPrivatize, Alignment,
9061 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9062 }
9063 }
9064
9065 Value *DepArray = emitTaskDependencies(*this, Dependencies);
9066
9067 // ---------------------------------------------------------------
9068 // V5.2 13.8 target construct
9069 // If the nowait clause is present, execution of the target task
9070 // may be deferred. If the nowait clause is not present, the target task is
9071 // an included task.
9072 // ---------------------------------------------------------------
9073 // The above means that the lack of a nowait on the target construct
9074 // translates to '#pragma omp task if(0)'
9075 if (!NeedsTargetTask) {
9076 if (DepArray) {
9077 Function *TaskWaitFn =
9078 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9080 TaskWaitFn,
9081 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9082 /*ndeps=*/Builder.getInt32(Dependencies.size()),
9083 /*dep_list=*/DepArray,
9084 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9085 /*noalias_dep_list=*/
9087 }
9088 // Included task.
9089 Function *TaskBeginFn =
9090 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9091 Function *TaskCompleteFn =
9092 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9093 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9094 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9095 CI->setDebugLoc(StaleCI->getDebugLoc());
9096 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9097 } else if (DepArray) {
9098 // HasNoWait - meaning the task may be deferred. Call
9099 // __kmpc_omp_task_with_deps if there are dependencies,
9100 // else call __kmpc_omp_task
9101 Function *TaskFn =
9102 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9104 TaskFn,
9105 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
9106 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
9108 } else {
9109 // Emit the @__kmpc_omp_task runtime call to spawn the task
9110 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9111 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9112 }
9113
// The placeholder call and the helper instructions recorded in ToBeDeleted
// are no longer needed once the real task sequence is in place.
9114 StaleCI->eraseFromParent();
9115 for (Instruction *I : llvm::reverse(ToBeDeleted))
9116 I->eraseFromParent();
9117 };
9118 addOutlineInfo(std::move(OI));
9119
9120 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9121 << *(Builder.GetInsertBlock()) << "\n");
9122 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9123 << *(Builder.GetInsertBlock()->getParent()->getParent())
9124 << "\n");
9125 return Builder.saveIP();
9126}
9127
9129 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9130 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9131 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9132 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9133 if (Error Err =
9134 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9135 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9136 return Err;
9137 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9138 return Error::success();
9139}
9140
9141static void emitTargetCall(
9142 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9147 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9152 bool HasNoWait, Value *DynCGroupMem,
9153 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9154 // Generate a function call to the host fallback implementation of the target
9155 // region. This is called by the host when no offload entry was generated for
9156 // the target region and when the offloading call fails at runtime.
9157 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9159 Builder.restoreIP(IP);
9160 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
9161 return Builder.saveIP();
9162 };
9163
9164 bool HasDependencies = Dependencies.size() > 0;
9165 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9166
9168
9169 auto TaskBodyCB =
9170 [&](Value *DeviceID, Value *RTLoc,
9171 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9172 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9173 // produce any.
9175 // emitKernelLaunch makes the necessary runtime call to offload the
9176 // kernel. We then outline all that code into a separate function
9177 // ('kernel_launch_function' in the pseudo code above). This function is
9178 // then called by the target task proxy function (see
9179 // '@.omp_target_task_proxy_func' in the pseudo code above)
9180 // "@.omp_target_task_proxy_func' is generated by
9181 // emitTargetTaskProxyFunction.
9182 if (OutlinedFnID && DeviceID)
9183 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9184 EmitTargetCallFallbackCB, KArgs,
9185 DeviceID, RTLoc, TargetTaskAllocaIP);
9186
9187 // We only need to do the outlining if `DeviceID` is set to avoid calling
9188 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9189 // generating the `else` branch of an `if` clause.
9190 //
9191 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9192 // In this case, we execute the host implementation directly.
9193 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9194 }());
9195
9196 OMPBuilder.Builder.restoreIP(AfterIP);
9197 return Error::success();
9198 };
9199
9200 auto &&EmitTargetCallElse =
9201 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9203 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9204 // produce any.
9206 if (RequiresOuterTargetTask) {
9207 // Arguments that are intended to be directly forwarded to an
9208 // emitKernelLaunch call are pased as nullptr, since
9209 // OutlinedFnID=nullptr results in that call not being done.
9211 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9212 /*RTLoc=*/nullptr, AllocaIP,
9213 Dependencies, EmptyRTArgs, HasNoWait);
9214 }
9215 return EmitTargetCallFallbackCB(Builder.saveIP());
9216 }());
9217
9218 Builder.restoreIP(AfterIP);
9219 return Error::success();
9220 };
9221
9222 auto &&EmitTargetCallThen =
9223 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9225 Info.HasNoWait = HasNoWait;
9226 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9228 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9229 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9230 /*IsNonContiguous=*/true,
9231 /*ForEndCall=*/false))
9232 return Err;
9233
9234 SmallVector<Value *, 3> NumTeamsC;
9235 for (auto [DefaultVal, RuntimeVal] :
9236 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9237 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9238 : Builder.getInt32(DefaultVal));
9239
9240 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9241 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9242 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9243 if (Clause)
9244 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9245 /*isSigned=*/false);
9246 return Clause;
9247 };
9248 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9249 if (Clause)
9250 Result =
9251 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9252 Result, Clause)
9253 : Clause;
9254 };
9255
9256 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9257 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9258 SmallVector<Value *, 3> NumThreadsC;
9259 Value *MaxThreadsClause =
9260 RuntimeAttrs.TeamsThreadLimit.size() == 1
9261 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9262 : nullptr;
9263
9264 for (auto [TeamsVal, TargetVal] : zip_equal(
9265 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9266 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9267 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9268
9269 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9270 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9271
9272 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9273 }
9274
9275 unsigned NumTargetItems = Info.NumberOfPtrs;
9276 uint32_t SrcLocStrSize;
9277 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9278 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9279 llvm::omp::IdentFlag(0), 0);
9280
9281 Value *TripCount = RuntimeAttrs.LoopTripCount
9282 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9283 Builder.getInt64Ty(),
9284 /*isSigned=*/false)
9285 : Builder.getInt64(0);
9286
9287 // Request zero groupprivate bytes by default.
9288 if (!DynCGroupMem)
9289 DynCGroupMem = Builder.getInt32(0);
9290
9292 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9293 HasNoWait, DynCGroupMemFallback);
9294
9295 // Assume no error was returned because TaskBodyCB and
9296 // EmitTargetCallFallbackCB don't produce any.
9298 // The presence of certain clauses on the target directive require the
9299 // explicit generation of the target task.
9300 if (RequiresOuterTargetTask)
9301 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9302 RTLoc, AllocaIP, Dependencies,
9303 KArgs.RTArgs, Info.HasNoWait);
9304
9305 return OMPBuilder.emitKernelLaunch(
9306 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9307 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9308 }());
9309
9310 Builder.restoreIP(AfterIP);
9311 return Error::success();
9312 };
9313
9314 // If we don't have an ID for the target region, it means an offload entry
9315 // wasn't created. In this case we just run the host fallback directly and
9316 // ignore any potential 'if' clauses.
9317 if (!OutlinedFnID) {
9318 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9319 return;
9320 }
9321
9322 // If there's no 'if' clause, only generate the kernel launch code path.
9323 if (!IfCond) {
9324 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9325 return;
9326 }
9327
9328 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9329 EmitTargetCallElse, AllocaIP));
9330}
9331
9333 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9334 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9335 TargetRegionEntryInfo &EntryInfo,
9336 const TargetKernelDefaultAttrs &DefaultAttrs,
9337 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9338 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9341 CustomMapperCallbackTy CustomMapperCB,
9342 const SmallVector<DependData> &Dependencies, bool HasNowait,
9343 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9344
9345 if (!updateToLocation(Loc))
9346 return InsertPointTy();
9347
9348 Builder.restoreIP(CodeGenIP);
9349
9350 Function *OutlinedFn;
9351 Constant *OutlinedFnID = nullptr;
9352 // The target region is outlined into its own function. The LLVM IR for
9353 // the target region itself is generated using the callbacks CBFunc
9354 // and ArgAccessorFuncCB
9356 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9357 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9358 return Err;
9359
9360 // If we are not on the target device, then we need to generate code
9361 // to make a remote call (offload) to the previously outlined function
9362 // that represents the target region. Do that now.
9363 if (!Config.isTargetDevice())
9364 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9365 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9366 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9367 DynCGroupMemFallback);
9368 return Builder.saveIP();
9369}
9370
9371std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9372 StringRef FirstSeparator,
9373 StringRef Separator) {
9374 SmallString<128> Buffer;
9375 llvm::raw_svector_ostream OS(Buffer);
9376 StringRef Sep = FirstSeparator;
9377 for (StringRef Part : Parts) {
9378 OS << Sep << Part;
9379 Sep = Separator;
9380 }
9381 return OS.str().str();
9382}
9383
9384std::string
9386 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9387 Config.separator());
9388}
9389
9391 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9392 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9393 if (Elem.second) {
9394 assert(Elem.second->getValueType() == Ty &&
9395 "OMP internal variable has different type than requested");
9396 } else {
9397 // TODO: investigate the appropriate linkage type used for the global
9398 // variable for possibly changing that to internal or private, or maybe
9399 // create different versions of the function for different OMP internal
9400 // variables.
9401 const DataLayout &DL = M.getDataLayout();
9402 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9403 // default global AS is 1.
9404 // See double-target-call-with-declare-target.f90 and
9405 // declare-target-vars-in-target-region.f90 libomptarget
9406 // tests.
9407 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9408 : M.getTargetTriple().isAMDGPU()
9409 ? 0
9410 : DL.getDefaultGlobalsAddressSpace();
9411 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9414 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9415 Constant::getNullValue(Ty), Elem.first(),
9416 /*InsertBefore=*/nullptr,
9417 GlobalValue::NotThreadLocal, AddressSpaceVal);
9418 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9419 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9420 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9421 Elem.second = GV;
9422 }
9423
9424 return Elem.second;
9425}
9426
9427Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9428 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9429 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9430 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9431}
9432
9434 LLVMContext &Ctx = Builder.getContext();
9435 Value *Null =
9436 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9437 Value *SizeGep =
9438 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9439 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9440 return SizePtrToInt;
9441}
9442
9445 std::string VarName) {
9446 llvm::Constant *MaptypesArrayInit =
9447 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9448 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9449 M, MaptypesArrayInit->getType(),
9450 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9451 VarName);
9452 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9453 return MaptypesArrayGlobal;
9454}
9455
9457 InsertPointTy AllocaIP,
9458 unsigned NumOperands,
9459 struct MapperAllocas &MapperAllocas) {
9460 if (!updateToLocation(Loc))
9461 return;
9462
9463 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9464 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9465 Builder.restoreIP(AllocaIP);
9466 AllocaInst *ArgsBase = Builder.CreateAlloca(
9467 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9468 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9469 ".offload_ptrs");
9470 AllocaInst *ArgSizes = Builder.CreateAlloca(
9471 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9473 MapperAllocas.ArgsBase = ArgsBase;
9474 MapperAllocas.Args = Args;
9475 MapperAllocas.ArgSizes = ArgSizes;
9476}
9477
9479 Function *MapperFunc, Value *SrcLocInfo,
9480 Value *MaptypesArg, Value *MapnamesArg,
9482 int64_t DeviceID, unsigned NumOperands) {
9483 if (!updateToLocation(Loc))
9484 return;
9485
9486 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9487 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9488 Value *ArgsBaseGEP =
9489 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9490 {Builder.getInt32(0), Builder.getInt32(0)});
9491 Value *ArgsGEP =
9492 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9493 {Builder.getInt32(0), Builder.getInt32(0)});
9494 Value *ArgSizesGEP =
9495 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9496 {Builder.getInt32(0), Builder.getInt32(0)});
9497 Value *NullPtr =
9498 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9499 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9500 Builder.getInt32(NumOperands),
9501 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9502 MaptypesArg, MapnamesArg, NullPtr});
9503}
9504
9506 TargetDataRTArgs &RTArgs,
9507 TargetDataInfo &Info,
9508 bool ForEndCall) {
9509 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9510 "expected region end call to runtime only when end call is separate");
9511 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9512 auto VoidPtrTy = UnqualPtrTy;
9513 auto VoidPtrPtrTy = UnqualPtrTy;
9514 auto Int64Ty = Type::getInt64Ty(M.getContext());
9515 auto Int64PtrTy = UnqualPtrTy;
9516
9517 if (!Info.NumberOfPtrs) {
9518 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9519 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9520 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9521 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9522 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9523 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9524 return;
9525 }
9526
9527 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9528 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9529 Info.RTArgs.BasePointersArray,
9530 /*Idx0=*/0, /*Idx1=*/0);
9531 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9532 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9533 /*Idx0=*/0,
9534 /*Idx1=*/0);
9535 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9536 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9537 /*Idx0=*/0, /*Idx1=*/0);
9538 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9539 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9540 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9541 : Info.RTArgs.MapTypesArray,
9542 /*Idx0=*/0,
9543 /*Idx1=*/0);
9544
9545 // Only emit the mapper information arrays if debug information is
9546 // requested.
9547 if (!Info.EmitDebug)
9548 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9549 else
9550 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9551 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9552 /*Idx0=*/0,
9553 /*Idx1=*/0);
9554 // If there is no user-defined mapper, set the mapper array to nullptr to
9555 // avoid an unnecessary data privatization
9556 if (!Info.HasMapper)
9557 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9558 else
9559 RTArgs.MappersArray =
9560 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9561}
9562
9564 InsertPointTy CodeGenIP,
9565 MapInfosTy &CombinedInfo,
9566 TargetDataInfo &Info) {
9568 CombinedInfo.NonContigInfo;
9569
9570 // Build an array of struct descriptor_dim and then assign it to
9571 // offload_args.
9572 //
9573 // struct descriptor_dim {
9574 // uint64_t offset;
9575 // uint64_t count;
9576 // uint64_t stride
9577 // };
9578 Type *Int64Ty = Builder.getInt64Ty();
9580 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9581 "struct.descriptor_dim");
9582
9583 enum { OffsetFD = 0, CountFD, StrideFD };
9584 // We need two index variable here since the size of "Dims" is the same as
9585 // the size of Components, however, the size of offset, count, and stride is
9586 // equal to the size of base declaration that is non-contiguous.
9587 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9588 // Skip emitting ir if dimension size is 1 since it cannot be
9589 // non-contiguous.
9590 if (NonContigInfo.Dims[I] == 1)
9591 continue;
9592 Builder.restoreIP(AllocaIP);
9593 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9594 AllocaInst *DimsAddr =
9595 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9596 Builder.restoreIP(CodeGenIP);
9597 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9598 unsigned RevIdx = EE - II - 1;
9599 Value *DimsLVal = Builder.CreateInBoundsGEP(
9600 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9601 // Offset
9602 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9603 Builder.CreateAlignedStore(
9604 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9605 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9606 // Count
9607 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9608 Builder.CreateAlignedStore(
9609 NonContigInfo.Counts[L][RevIdx], CountLVal,
9610 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9611 // Stride
9612 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9613 Builder.CreateAlignedStore(
9614 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9615 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9616 }
9617 // args[I] = &dims
9618 Builder.restoreIP(CodeGenIP);
9619 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9620 DimsAddr, Builder.getPtrTy());
9621 Value *P = Builder.CreateConstInBoundsGEP2_32(
9622 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9623 Info.RTArgs.PointersArray, 0, I);
9624 Builder.CreateAlignedStore(
9625 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9626 ++L;
9627 }
9628}
9629
9630void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9631 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9632 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9633 BasicBlock *ExitBB, bool IsInit) {
9634 StringRef Prefix = IsInit ? ".init" : ".del";
9635
9636 // Evaluate if this is an array section.
9638 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9639 Value *IsArray =
9640 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9641 Value *DeleteBit = Builder.CreateAnd(
9642 MapType,
9643 Builder.getInt64(
9644 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9645 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9646 Value *DeleteCond;
9647 Value *Cond;
9648 if (IsInit) {
9649 // base != begin?
9650 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9651 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9652 DeleteCond = Builder.CreateIsNull(
9653 DeleteBit,
9654 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9655 } else {
9656 Cond = IsArray;
9657 DeleteCond = Builder.CreateIsNotNull(
9658 DeleteBit,
9659 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9660 }
9661 Cond = Builder.CreateAnd(Cond, DeleteCond);
9662 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9663
9664 emitBlock(BodyBB, MapperFn);
9665 // Get the array size by multiplying element size and element number (i.e., \p
9666 // Size).
9667 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9668 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9669 // memory allocation/deletion purpose only.
9670 Value *MapTypeArg = Builder.CreateAnd(
9671 MapType,
9672 Builder.getInt64(
9673 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9674 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9675 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9676 MapTypeArg = Builder.CreateOr(
9677 MapTypeArg,
9678 Builder.getInt64(
9679 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9680 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9681
9682 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9683 // data structure.
9684 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9685 ArraySize, MapTypeArg, MapName};
9687 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9688 OffloadingArgs);
9689}
9690
9693 llvm::Value *BeginArg)>
9694 GenMapInfoCB,
9695 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9696 SmallVector<Type *> Params;
9697 Params.emplace_back(Builder.getPtrTy());
9698 Params.emplace_back(Builder.getPtrTy());
9699 Params.emplace_back(Builder.getPtrTy());
9700 Params.emplace_back(Builder.getInt64Ty());
9701 Params.emplace_back(Builder.getInt64Ty());
9702 Params.emplace_back(Builder.getPtrTy());
9703
9704 auto *FnTy =
9705 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9706
9707 SmallString<64> TyStr;
9708 raw_svector_ostream Out(TyStr);
9709 Function *MapperFn =
9711 MapperFn->addFnAttr(Attribute::NoInline);
9712 MapperFn->addFnAttr(Attribute::NoUnwind);
9713 MapperFn->addParamAttr(0, Attribute::NoUndef);
9714 MapperFn->addParamAttr(1, Attribute::NoUndef);
9715 MapperFn->addParamAttr(2, Attribute::NoUndef);
9716 MapperFn->addParamAttr(3, Attribute::NoUndef);
9717 MapperFn->addParamAttr(4, Attribute::NoUndef);
9718 MapperFn->addParamAttr(5, Attribute::NoUndef);
9719
9720 // Start the mapper function code generation.
9721 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9722 auto SavedIP = Builder.saveIP();
9723 Builder.SetInsertPoint(EntryBB);
9724
9725 Value *MapperHandle = MapperFn->getArg(0);
9726 Value *BaseIn = MapperFn->getArg(1);
9727 Value *BeginIn = MapperFn->getArg(2);
9728 Value *Size = MapperFn->getArg(3);
9729 Value *MapType = MapperFn->getArg(4);
9730 Value *MapName = MapperFn->getArg(5);
9731
9732 // Compute the starting and end addresses of array elements.
9733 // Prepare common arguments for array initiation and deletion.
9734 // Convert the size in bytes into the number of array elements.
9735 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9736 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9737 Value *PtrBegin = BeginIn;
9738 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9739
9740 // Emit array initiation if this is an array section and \p MapType indicates
9741 // that memory allocation is required.
9742 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9743 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9744 MapType, MapName, ElementSize, HeadBB,
9745 /*IsInit=*/true);
9746
9747 // Emit a for loop to iterate through SizeArg of elements and map all of them.
9748
9749 // Emit the loop header block.
9750 emitBlock(HeadBB, MapperFn);
9751 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9752 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9753 // Evaluate whether the initial condition is satisfied.
9754 Value *IsEmpty =
9755 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9756 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9757
9758 // Emit the loop body block.
9759 emitBlock(BodyBB, MapperFn);
9760 BasicBlock *LastBB = BodyBB;
9761 PHINode *PtrPHI =
9762 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9763 PtrPHI->addIncoming(PtrBegin, HeadBB);
9764
9765 // Get map clause information. Fill up the arrays with all mapped variables.
9766 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9767 if (!Info)
9768 return Info.takeError();
9769
9770 // Call the runtime API __tgt_mapper_num_components to get the number of
9771 // pre-existing components.
9772 Value *OffloadingArgs[] = {MapperHandle};
9773 Value *PreviousSize = createRuntimeFunctionCall(
9774 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9775 OffloadingArgs);
9776 Value *ShiftedPreviousSize =
9777 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9778
9779 // Fill up the runtime mapper handle for all components.
9780 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9781 Value *CurBaseArg = Info->BasePointers[I];
9782 Value *CurBeginArg = Info->Pointers[I];
9783 Value *CurSizeArg = Info->Sizes[I];
9784 Value *CurNameArg = Info->Names.size()
9785 ? Info->Names[I]
9786 : Constant::getNullValue(Builder.getPtrTy());
9787
9788 // Extract the MEMBER_OF field from the map type.
9789 Value *OriMapType = Builder.getInt64(
9790 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9791 Info->Types[I]));
9792 Value *MemberMapType =
9793 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9794
9795 // Combine the map type inherited from user-defined mapper with that
9796 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9797 // bits of the \a MapType, which is the input argument of the mapper
9798 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9799 // bits of MemberMapType.
9800 // [OpenMP 5.0], 1.2.6. map-type decay.
9801 // | alloc | to | from | tofrom | release | delete
9802 // ----------------------------------------------------------
9803 // alloc | alloc | alloc | alloc | alloc | release | delete
9804 // to | alloc | to | alloc | to | release | delete
9805 // from | alloc | alloc | from | from | release | delete
9806 // tofrom | alloc | to | from | tofrom | release | delete
9807 Value *LeftToFrom = Builder.CreateAnd(
9808 MapType,
9809 Builder.getInt64(
9810 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9811 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9812 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9813 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9814 BasicBlock *AllocElseBB =
9815 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9816 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9817 BasicBlock *ToElseBB =
9818 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9819 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9820 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9821 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9822 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9823 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9824 emitBlock(AllocBB, MapperFn);
9825 Value *AllocMapType = Builder.CreateAnd(
9826 MemberMapType,
9827 Builder.getInt64(
9828 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9829 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9830 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9831 Builder.CreateBr(EndBB);
9832 emitBlock(AllocElseBB, MapperFn);
9833 Value *IsTo = Builder.CreateICmpEQ(
9834 LeftToFrom,
9835 Builder.getInt64(
9836 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9837 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9838 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9839 // In case of to, clear OMP_MAP_FROM.
9840 emitBlock(ToBB, MapperFn);
9841 Value *ToMapType = Builder.CreateAnd(
9842 MemberMapType,
9843 Builder.getInt64(
9844 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9845 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9846 Builder.CreateBr(EndBB);
9847 emitBlock(ToElseBB, MapperFn);
9848 Value *IsFrom = Builder.CreateICmpEQ(
9849 LeftToFrom,
9850 Builder.getInt64(
9851 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9852 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9853 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9854 // In case of from, clear OMP_MAP_TO.
9855 emitBlock(FromBB, MapperFn);
9856 Value *FromMapType = Builder.CreateAnd(
9857 MemberMapType,
9858 Builder.getInt64(
9859 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9860 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9861 // In case of tofrom, do nothing.
9862 emitBlock(EndBB, MapperFn);
9863 LastBB = EndBB;
9864 PHINode *CurMapType =
9865 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9866 CurMapType->addIncoming(AllocMapType, AllocBB);
9867 CurMapType->addIncoming(ToMapType, ToBB);
9868 CurMapType->addIncoming(FromMapType, FromBB);
9869 CurMapType->addIncoming(MemberMapType, ToElseBB);
9870
9871 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9872 CurSizeArg, CurMapType, CurNameArg};
9873
9874 auto ChildMapperFn = CustomMapperCB(I);
9875 if (!ChildMapperFn)
9876 return ChildMapperFn.takeError();
9877 if (*ChildMapperFn) {
9878 // Call the corresponding mapper function.
9879 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9880 ->setDoesNotThrow();
9881 } else {
9882 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9883 // data structure.
9885 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9886 OffloadingArgs);
9887 }
9888 }
9889
9890 // Update the pointer to point to the next element that needs to be mapped,
9891 // and check whether we have mapped all elements.
9892 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9893 "omp.arraymap.next");
9894 PtrPHI->addIncoming(PtrNext, LastBB);
9895 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9896 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9897 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9898
9899 emitBlock(ExitBB, MapperFn);
9900 // Emit array deletion if this is an array section and \p MapType indicates
9901 // that deletion is required.
9902 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9903 MapType, MapName, ElementSize, DoneBB,
9904 /*IsInit=*/false);
9905
9906 // Emit the function exit block.
9907 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9908
9909 Builder.CreateRetVoid();
9910 Builder.restoreIP(SavedIP);
9911 return MapperFn;
9912}
9913
9915 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9916 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9917 bool IsNonContiguous,
9918 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9919
9920 // Reset the array information.
9921 Info.clearArrayInfo();
9922 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9923
9924 if (Info.NumberOfPtrs == 0)
9925 return Error::success();
9926
9927 Builder.restoreIP(AllocaIP);
9928 // Detect if we have any capture size requiring runtime evaluation of the
9929 // size so that a constant array could be eventually used.
9930 ArrayType *PointerArrayType =
9931 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9932
9933 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9934 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9935
9936 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9937 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9938 AllocaInst *MappersArray = Builder.CreateAlloca(
9939 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9940 Info.RTArgs.MappersArray = MappersArray;
9941
9942 // If we don't have any VLA types or other types that require runtime
9943 // evaluation, we can use a constant array for the map sizes, otherwise we
9944 // need to fill up the arrays as we do for the pointers.
9945 Type *Int64Ty = Builder.getInt64Ty();
9946 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9947 ConstantInt::get(Int64Ty, 0));
9948 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9949 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9950 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9951 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9952 if (IsNonContiguous &&
9953 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9954 CombinedInfo.Types[I] &
9955 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9956 ConstSizes[I] =
9957 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9958 else
9959 ConstSizes[I] = CI;
9960 continue;
9961 }
9962 }
9963 RuntimeSizes.set(I);
9964 }
9965
9966 if (RuntimeSizes.all()) {
9967 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9968 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9969 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9970 restoreIPandDebugLoc(Builder, CodeGenIP);
9971 } else {
9972 auto *SizesArrayInit = ConstantArray::get(
9973 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9974 std::string Name = createPlatformSpecificName({"offload_sizes"});
9975 auto *SizesArrayGbl =
9976 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9977 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9978 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9979
9980 if (!RuntimeSizes.any()) {
9981 Info.RTArgs.SizesArray = SizesArrayGbl;
9982 } else {
9983 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9984 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9985 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9986 AllocaInst *Buffer = Builder.CreateAlloca(
9987 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9988 Buffer->setAlignment(OffloadSizeAlign);
9989 restoreIPandDebugLoc(Builder, CodeGenIP);
9990 Builder.CreateMemCpy(
9991 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9992 SizesArrayGbl, OffloadSizeAlign,
9993 Builder.getIntN(
9994 IndexSize,
9995 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9996
9997 Info.RTArgs.SizesArray = Buffer;
9998 }
9999 restoreIPandDebugLoc(Builder, CodeGenIP);
10000 }
10001
10002 // The map types are always constant so we don't need to generate code to
10003 // fill arrays. Instead, we create an array constant.
10005 for (auto mapFlag : CombinedInfo.Types)
10006 Mapping.push_back(
10007 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10008 mapFlag));
10009 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10010 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10011 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10012
10013 // The information types are only built if provided.
10014 if (!CombinedInfo.Names.empty()) {
10015 auto *MapNamesArrayGbl = createOffloadMapnames(
10016 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10017 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10018 Info.EmitDebug = true;
10019 } else {
10020 Info.RTArgs.MapNamesArray =
10022 Info.EmitDebug = false;
10023 }
10024
10025 // If there's a present map type modifier, it must not be applied to the end
10026 // of a region, so generate a separate map type array in that case.
10027 if (Info.separateBeginEndCalls()) {
10028 bool EndMapTypesDiffer = false;
10029 for (uint64_t &Type : Mapping) {
10030 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10031 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10032 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10033 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10034 EndMapTypesDiffer = true;
10035 }
10036 }
10037 if (EndMapTypesDiffer) {
10038 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10039 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10040 }
10041 }
10042
10043 PointerType *PtrTy = Builder.getPtrTy();
10044 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10045 Value *BPVal = CombinedInfo.BasePointers[I];
10046 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10047 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10048 0, I);
10049 Builder.CreateAlignedStore(BPVal, BP,
10050 M.getDataLayout().getPrefTypeAlign(PtrTy));
10051
10052 if (Info.requiresDevicePointerInfo()) {
10053 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10054 CodeGenIP = Builder.saveIP();
10055 Builder.restoreIP(AllocaIP);
10056 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10057 Builder.restoreIP(CodeGenIP);
10058 if (DeviceAddrCB)
10059 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10060 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10061 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10062 if (DeviceAddrCB)
10063 DeviceAddrCB(I, BP);
10064 }
10065 }
10066
10067 Value *PVal = CombinedInfo.Pointers[I];
10068 Value *P = Builder.CreateConstInBoundsGEP2_32(
10069 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10070 I);
10071 // TODO: Check alignment correct.
10072 Builder.CreateAlignedStore(PVal, P,
10073 M.getDataLayout().getPrefTypeAlign(PtrTy));
10074
10075 if (RuntimeSizes.test(I)) {
10076 Value *S = Builder.CreateConstInBoundsGEP2_32(
10077 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10078 /*Idx0=*/0,
10079 /*Idx1=*/I);
10080 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10081 Int64Ty,
10082 /*isSigned=*/true),
10083 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10084 }
10085 // Fill up the mapper array.
10086 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10087 Value *MFunc = ConstantPointerNull::get(PtrTy);
10088
10089 auto CustomMFunc = CustomMapperCB(I);
10090 if (!CustomMFunc)
10091 return CustomMFunc.takeError();
10092 if (*CustomMFunc)
10093 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10094
10095 Value *MAddr = Builder.CreateInBoundsGEP(
10096 PointerArrayType, MappersArray,
10097 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10098 Builder.CreateAlignedStore(
10099 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10100 }
10101
10102 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10103 Info.NumberOfPtrs == 0)
10104 return Error::success();
10105 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10106 return Error::success();
10107}
10108
10110 BasicBlock *CurBB = Builder.GetInsertBlock();
10111
10112 if (!CurBB || CurBB->getTerminator()) {
10113 // If there is no insert point or the previous block is already
10114 // terminated, don't touch it.
10115 } else {
10116 // Otherwise, create a fall-through branch.
10117 Builder.CreateBr(Target);
10118 }
10119
10120 Builder.ClearInsertionPoint();
10121}
10122
10124 bool IsFinished) {
10125 BasicBlock *CurBB = Builder.GetInsertBlock();
10126
10127 // Fall out of the current block (if necessary).
10128 emitBranch(BB);
10129
10130 if (IsFinished && BB->use_empty()) {
10131 BB->eraseFromParent();
10132 return;
10133 }
10134
10135 // Place the block after the current block, if possible, or else at
10136 // the end of the function.
10137 if (CurBB && CurBB->getParent())
10138 CurFn->insert(std::next(CurBB->getIterator()), BB);
10139 else
10140 CurFn->insert(CurFn->end(), BB);
10141 Builder.SetInsertPoint(BB);
10142}
10143
10145 BodyGenCallbackTy ElseGen,
10146 InsertPointTy AllocaIP) {
10147 // If the condition constant folds and can be elided, try to avoid emitting
10148 // the condition and the dead arm of the if/else.
10149 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10150 auto CondConstant = CI->getSExtValue();
10151 if (CondConstant)
10152 return ThenGen(AllocaIP, Builder.saveIP());
10153
10154 return ElseGen(AllocaIP, Builder.saveIP());
10155 }
10156
10157 Function *CurFn = Builder.GetInsertBlock()->getParent();
10158
10159 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10160 // emit the conditional branch.
10161 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10162 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10163 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10164 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10165 // Emit the 'then' code.
10166 emitBlock(ThenBlock, CurFn);
10167 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10168 return Err;
10169 emitBranch(ContBlock);
10170 // Emit the 'else' code if present.
10171 // There is no need to emit line number for unconditional branch.
10172 emitBlock(ElseBlock, CurFn);
10173 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10174 return Err;
10175 // There is no need to emit line number for unconditional branch.
10176 emitBranch(ContBlock);
10177 // Emit the continuation block for code after the if.
10178 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10179 return Error::success();
10180}
10181
10182bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10183 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10186 "Unexpected Atomic Ordering.");
10187
10188 bool Flush = false;
10190
10191 switch (AK) {
10192 case Read:
10195 FlushAO = AtomicOrdering::Acquire;
10196 Flush = true;
10197 }
10198 break;
10199 case Write:
10200 case Compare:
10201 case Update:
10204 FlushAO = AtomicOrdering::Release;
10205 Flush = true;
10206 }
10207 break;
10208 case Capture:
10209 switch (AO) {
10211 FlushAO = AtomicOrdering::Acquire;
10212 Flush = true;
10213 break;
10215 FlushAO = AtomicOrdering::Release;
10216 Flush = true;
10217 break;
10221 Flush = true;
10222 break;
10223 default:
10224 // do nothing - leave silently.
10225 break;
10226 }
10227 }
10228
10229 if (Flush) {
10230 // Currently Flush RT call still doesn't take memory_ordering, so for when
10231 // that happens, this tries to do the resolution of which atomic ordering
10232 // to use with but issue the flush call
10233 // TODO: pass `FlushAO` after memory ordering support is added
10234 (void)FlushAO;
10235 emitFlush(Loc);
10236 }
10237
10238 // for AO == AtomicOrdering::Monotonic and all other case combinations
10239 // do nothing
10240 return Flush;
10241}
10242
10246 AtomicOrdering AO, InsertPointTy AllocaIP) {
10247 if (!updateToLocation(Loc))
10248 return Loc.IP;
10249
10250 assert(X.Var->getType()->isPointerTy() &&
10251 "OMP Atomic expects a pointer to target memory");
10252 Type *XElemTy = X.ElemTy;
10253 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10254 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10255 "OMP atomic read expected a scalar type");
10256
10257 Value *XRead = nullptr;
10258
10259 if (XElemTy->isIntegerTy()) {
10260 LoadInst *XLD =
10261 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10262 XLD->setAtomic(AO);
10263 XRead = cast<Value>(XLD);
10264 } else if (XElemTy->isStructTy()) {
10265 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10266 // target does not support `atomicrmw` of the size of the struct
10267 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10268 OldVal->setAtomic(AO);
10269 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10270 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10271 OpenMPIRBuilder::AtomicInfo atomicInfo(
10272 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10273 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10274 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10275 XRead = AtomicLoadRes.first;
10276 OldVal->eraseFromParent();
10277 } else {
10278 // We need to perform atomic op as integer
10279 IntegerType *IntCastTy =
10280 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10281 LoadInst *XLoad =
10282 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10283 XLoad->setAtomic(AO);
10284 if (XElemTy->isFloatingPointTy()) {
10285 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10286 } else {
10287 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10288 }
10289 }
10290 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10291 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10292 return Builder.saveIP();
10293}
10294
10297 AtomicOpValue &X, Value *Expr,
10298 AtomicOrdering AO, InsertPointTy AllocaIP) {
10299 if (!updateToLocation(Loc))
10300 return Loc.IP;
10301
10302 assert(X.Var->getType()->isPointerTy() &&
10303 "OMP Atomic expects a pointer to target memory");
10304 Type *XElemTy = X.ElemTy;
10305 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10306 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10307 "OMP atomic write expected a scalar type");
10308
10309 if (XElemTy->isIntegerTy()) {
10310 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10311 XSt->setAtomic(AO);
10312 } else if (XElemTy->isStructTy()) {
10313 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10314 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10315 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10316 OpenMPIRBuilder::AtomicInfo atomicInfo(
10317 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10318 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10319 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10320 OldVal->eraseFromParent();
10321 } else {
10322 // We need to bitcast and perform atomic op as integers
10323 IntegerType *IntCastTy =
10324 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10325 Value *ExprCast =
10326 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10327 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10328 XSt->setAtomic(AO);
10329 }
10330
10331 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10332 return Builder.saveIP();
10333}
10334
10337 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10338 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10339 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10340 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10341 if (!updateToLocation(Loc))
10342 return Loc.IP;
10343
10344 LLVM_DEBUG({
10345 Type *XTy = X.Var->getType();
10346 assert(XTy->isPointerTy() &&
10347 "OMP Atomic expects a pointer to target memory");
10348 Type *XElemTy = X.ElemTy;
10349 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10350 XElemTy->isPointerTy()) &&
10351 "OMP atomic update expected a scalar type");
10352 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10353 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10354 "OpenMP atomic does not support LT or GT operations");
10355 });
10356
10357 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10358 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10359 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10360 if (!AtomicResult)
10361 return AtomicResult.takeError();
10362 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10363 return Builder.saveIP();
10364}
10365
10366// FIXME: Duplicating AtomicExpand
10367Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10368 AtomicRMWInst::BinOp RMWOp) {
10369 switch (RMWOp) {
10370 case AtomicRMWInst::Add:
10371 return Builder.CreateAdd(Src1, Src2);
10372 case AtomicRMWInst::Sub:
10373 return Builder.CreateSub(Src1, Src2);
10374 case AtomicRMWInst::And:
10375 return Builder.CreateAnd(Src1, Src2);
10377 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10378 case AtomicRMWInst::Or:
10379 return Builder.CreateOr(Src1, Src2);
10380 case AtomicRMWInst::Xor:
10381 return Builder.CreateXor(Src1, Src2);
10386 case AtomicRMWInst::Max:
10387 case AtomicRMWInst::Min:
10398 llvm_unreachable("Unsupported atomic update operation");
10399 }
10400 llvm_unreachable("Unsupported atomic update operation");
10401}
10402
10403Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10404 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10406 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10407 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10408 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10409 // or a complex datatype.
10410 bool emitRMWOp = false;
10411 switch (RMWOp) {
10412 case AtomicRMWInst::Add:
10413 case AtomicRMWInst::And:
10415 case AtomicRMWInst::Or:
10416 case AtomicRMWInst::Xor:
10418 emitRMWOp = XElemTy;
10419 break;
10420 case AtomicRMWInst::Sub:
10421 emitRMWOp = (IsXBinopExpr && XElemTy);
10422 break;
10423 default:
10424 emitRMWOp = false;
10425 }
10426 emitRMWOp &= XElemTy->isIntegerTy();
10427
10428 std::pair<Value *, Value *> Res;
10429 if (emitRMWOp) {
10430 AtomicRMWInst *RMWInst =
10431 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10432 if (T.isAMDGPU()) {
10433 if (IsIgnoreDenormalMode)
10434 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10435 llvm::MDNode::get(Builder.getContext(), {}));
10436 if (!IsFineGrainedMemory)
10437 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10438 llvm::MDNode::get(Builder.getContext(), {}));
10439 if (!IsRemoteMemory)
10440 RMWInst->setMetadata("amdgpu.no.remote.memory",
10441 llvm::MDNode::get(Builder.getContext(), {}));
10442 }
10443 Res.first = RMWInst;
10444 // not needed except in case of postfix captures. Generate anyway for
10445 // consistency with the else part. Will be removed with any DCE pass.
10446 // AtomicRMWInst::Xchg does not have a coressponding instruction.
10447 if (RMWOp == AtomicRMWInst::Xchg)
10448 Res.second = Res.first;
10449 else
10450 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10451 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10452 XElemTy->isStructTy()) {
10453 LoadInst *OldVal =
10454 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10455 OldVal->setAtomic(AO);
10456 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10457 unsigned LoadSize =
10458 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10459
10460 OpenMPIRBuilder::AtomicInfo atomicInfo(
10461 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10462 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10463 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10464 BasicBlock *CurBB = Builder.GetInsertBlock();
10465 Instruction *CurBBTI = CurBB->getTerminator();
10466 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10467 BasicBlock *ExitBB =
10468 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10469 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10470 X->getName() + ".atomic.cont");
10471 ContBB->getTerminator()->eraseFromParent();
10472 Builder.restoreIP(AllocaIP);
10473 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10474 NewAtomicAddr->setName(X->getName() + "x.new.val");
10475 Builder.SetInsertPoint(ContBB);
10476 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10477 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10478 Value *OldExprVal = PHI;
10479 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10480 if (!CBResult)
10481 return CBResult.takeError();
10482 Value *Upd = *CBResult;
10483 Builder.CreateStore(Upd, NewAtomicAddr);
10486 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10487 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10488 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10489 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10490 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10491 OldVal->eraseFromParent();
10492 Res.first = OldExprVal;
10493 Res.second = Upd;
10494
10495 if (UnreachableInst *ExitTI =
10497 CurBBTI->eraseFromParent();
10498 Builder.SetInsertPoint(ExitBB);
10499 } else {
10500 Builder.SetInsertPoint(ExitTI);
10501 }
10502 } else {
10503 IntegerType *IntCastTy =
10504 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10505 LoadInst *OldVal =
10506 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10507 OldVal->setAtomic(AO);
10508 // CurBB
10509 // | /---\
10510 // ContBB |
10511 // | \---/
10512 // ExitBB
10513 BasicBlock *CurBB = Builder.GetInsertBlock();
10514 Instruction *CurBBTI = CurBB->getTerminator();
10515 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10516 BasicBlock *ExitBB =
10517 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10518 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10519 X->getName() + ".atomic.cont");
10520 ContBB->getTerminator()->eraseFromParent();
10521 Builder.restoreIP(AllocaIP);
10522 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10523 NewAtomicAddr->setName(X->getName() + "x.new.val");
10524 Builder.SetInsertPoint(ContBB);
10525 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10526 PHI->addIncoming(OldVal, CurBB);
10527 bool IsIntTy = XElemTy->isIntegerTy();
10528 Value *OldExprVal = PHI;
10529 if (!IsIntTy) {
10530 if (XElemTy->isFloatingPointTy()) {
10531 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10532 X->getName() + ".atomic.fltCast");
10533 } else {
10534 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10535 X->getName() + ".atomic.ptrCast");
10536 }
10537 }
10538
10539 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10540 if (!CBResult)
10541 return CBResult.takeError();
10542 Value *Upd = *CBResult;
10543 Builder.CreateStore(Upd, NewAtomicAddr);
10544 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10547 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10548 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10549 Result->setVolatile(VolatileX);
10550 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10551 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10552 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10553 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10554
10555 Res.first = OldExprVal;
10556 Res.second = Upd;
10557
10558 // set Insertion point in exit block
10559 if (UnreachableInst *ExitTI =
10561 CurBBTI->eraseFromParent();
10562 Builder.SetInsertPoint(ExitBB);
10563 } else {
10564 Builder.SetInsertPoint(ExitTI);
10565 }
10566 }
10567
10568 return Res;
10569}
10570
10573 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10574 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10575 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10576 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10577 if (!updateToLocation(Loc))
10578 return Loc.IP;
10579
10580 LLVM_DEBUG({
10581 Type *XTy = X.Var->getType();
10582 assert(XTy->isPointerTy() &&
10583 "OMP Atomic expects a pointer to target memory");
10584 Type *XElemTy = X.ElemTy;
10585 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10586 XElemTy->isPointerTy()) &&
10587 "OMP atomic capture expected a scalar type");
10588 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10589 "OpenMP atomic does not support LT or GT operations");
10590 });
10591
10592 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10593 // 'x' is simply atomically rewritten with 'expr'.
10594 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10595 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10596 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10597 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10598 if (!AtomicResult)
10599 return AtomicResult.takeError();
10600 Value *CapturedVal =
10601 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10602 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10603
10604 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10605 return Builder.saveIP();
10606}
10607
10611 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10612 bool IsFailOnly) {
10613
10615 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10616 IsPostfixUpdate, IsFailOnly, Failure);
10617}
10618
10622 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10623 bool IsFailOnly, AtomicOrdering Failure) {
10624
10625 if (!updateToLocation(Loc))
10626 return Loc.IP;
10627
10628 assert(X.Var->getType()->isPointerTy() &&
10629 "OMP atomic expects a pointer to target memory");
10630 // compare capture
10631 if (V.Var) {
10632 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10633 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10634 }
10635
10636 bool IsInteger = E->getType()->isIntegerTy();
10637
10638 if (Op == OMPAtomicCompareOp::EQ) {
10639 AtomicCmpXchgInst *Result = nullptr;
10640 if (!IsInteger) {
10641 IntegerType *IntCastTy =
10642 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10643 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10644 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10645 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10646 AO, Failure);
10647 } else {
10648 Result =
10649 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10650 }
10651
10652 if (V.Var) {
10653 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10654 if (!IsInteger)
10655 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10656 assert(OldValue->getType() == V.ElemTy &&
10657 "OldValue and V must be of same type");
10658 if (IsPostfixUpdate) {
10659 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10660 } else {
10661 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10662 if (IsFailOnly) {
10663 // CurBB----
10664 // | |
10665 // v |
10666 // ContBB |
10667 // | |
10668 // v |
10669 // ExitBB <-
10670 //
10671 // where ContBB only contains the store of old value to 'v'.
10672 BasicBlock *CurBB = Builder.GetInsertBlock();
10673 Instruction *CurBBTI = CurBB->getTerminator();
10674 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10675 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10676 CurBBTI, X.Var->getName() + ".atomic.exit");
10677 BasicBlock *ContBB = CurBB->splitBasicBlock(
10678 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10679 ContBB->getTerminator()->eraseFromParent();
10680 CurBB->getTerminator()->eraseFromParent();
10681
10682 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10683
10684 Builder.SetInsertPoint(ContBB);
10685 Builder.CreateStore(OldValue, V.Var);
10686 Builder.CreateBr(ExitBB);
10687
10688 if (UnreachableInst *ExitTI =
10690 CurBBTI->eraseFromParent();
10691 Builder.SetInsertPoint(ExitBB);
10692 } else {
10693 Builder.SetInsertPoint(ExitTI);
10694 }
10695 } else {
10696 Value *CapturedValue =
10697 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10698 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10699 }
10700 }
10701 }
10702 // The comparison result has to be stored.
10703 if (R.Var) {
10704 assert(R.Var->getType()->isPointerTy() &&
10705 "r.var must be of pointer type");
10706 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10707
10708 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10709 Value *ResultCast = R.IsSigned
10710 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10711 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10712 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10713 }
10714 } else {
10715 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10716 "Op should be either max or min at this point");
10717 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10718
10719 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10720 // Let's take max as example.
10721 // OpenMP form:
10722 // x = x > expr ? expr : x;
10723 // LLVM form:
10724 // *ptr = *ptr > val ? *ptr : val;
10725 // We need to transform to LLVM form.
10726 // x = x <= expr ? x : expr;
10728 if (IsXBinopExpr) {
10729 if (IsInteger) {
10730 if (X.IsSigned)
10731 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10733 else
10734 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10736 } else {
10737 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10739 }
10740 } else {
10741 if (IsInteger) {
10742 if (X.IsSigned)
10743 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10745 else
10746 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10748 } else {
10749 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10751 }
10752 }
10753
10754 AtomicRMWInst *OldValue =
10755 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10756 if (V.Var) {
10757 Value *CapturedValue = nullptr;
10758 if (IsPostfixUpdate) {
10759 CapturedValue = OldValue;
10760 } else {
10761 CmpInst::Predicate Pred;
10762 switch (NewOp) {
10763 case AtomicRMWInst::Max:
10764 Pred = CmpInst::ICMP_SGT;
10765 break;
10767 Pred = CmpInst::ICMP_UGT;
10768 break;
10770 Pred = CmpInst::FCMP_OGT;
10771 break;
10772 case AtomicRMWInst::Min:
10773 Pred = CmpInst::ICMP_SLT;
10774 break;
10776 Pred = CmpInst::ICMP_ULT;
10777 break;
10779 Pred = CmpInst::FCMP_OLT;
10780 break;
10781 default:
10782 llvm_unreachable("unexpected comparison op");
10783 }
10784 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10785 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10786 }
10787 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10788 }
10789 }
10790
10791 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10792
10793 return Builder.saveIP();
10794}
10795
10798 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10799 Value *NumTeamsUpper, Value *ThreadLimit,
10800 Value *IfExpr) {
10801 if (!updateToLocation(Loc))
10802 return InsertPointTy();
10803
10804 uint32_t SrcLocStrSize;
10805 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10806 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10807 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10808
10809 // Outer allocation basicblock is the entry block of the current function.
10810 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10811 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10812 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10813 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10814 }
10815
10816 // The current basic block is split into four basic blocks. After outlining,
10817 // they will be mapped as follows:
10818 // ```
10819 // def current_fn() {
10820 // current_basic_block:
10821 // br label %teams.exit
10822 // teams.exit:
10823 // ; instructions after teams
10824 // }
10825 //
10826 // def outlined_fn() {
10827 // teams.alloca:
10828 // br label %teams.body
10829 // teams.body:
10830 // ; instructions within teams body
10831 // }
10832 // ```
10833 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10834 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10835 BasicBlock *AllocaBB =
10836 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10837
10838 bool SubClausesPresent =
10839 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10840 // Push num_teams
10841 if (!Config.isTargetDevice() && SubClausesPresent) {
10842 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10843 "if lowerbound is non-null, then upperbound must also be non-null "
10844 "for bounds on num_teams");
10845
10846 if (NumTeamsUpper == nullptr)
10847 NumTeamsUpper = Builder.getInt32(0);
10848
10849 if (NumTeamsLower == nullptr)
10850 NumTeamsLower = NumTeamsUpper;
10851
10852 if (IfExpr) {
10853 assert(IfExpr->getType()->isIntegerTy() &&
10854 "argument to if clause must be an integer value");
10855
10856 // upper = ifexpr ? upper : 1
10857 if (IfExpr->getType() != Int1)
10858 IfExpr = Builder.CreateICmpNE(IfExpr,
10859 ConstantInt::get(IfExpr->getType(), 0));
10860 NumTeamsUpper = Builder.CreateSelect(
10861 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10862
10863 // lower = ifexpr ? lower : 1
10864 NumTeamsLower = Builder.CreateSelect(
10865 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10866 }
10867
10868 if (ThreadLimit == nullptr)
10869 ThreadLimit = Builder.getInt32(0);
10870
10871 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
10872 // truncate or sign extend the passed values to match the int32 parameters.
10873 Value *NumTeamsLowerInt32 =
10874 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
10875 Value *NumTeamsUpperInt32 =
10876 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
10877 Value *ThreadLimitInt32 =
10878 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
10879
10880 Value *ThreadNum = getOrCreateThreadID(Ident);
10881
10883 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10884 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
10885 ThreadLimitInt32});
10886 }
10887 // Generate the body of teams.
10888 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10889 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10890 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10891 return Err;
10892
10893 OutlineInfo OI;
10894 OI.EntryBB = AllocaBB;
10895 OI.ExitBB = ExitBB;
10896 OI.OuterAllocaBB = &OuterAllocaBB;
10897
10898 // Insert fake values for global tid and bound tid.
10900 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10902 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10904 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10905
10906 auto HostPostOutlineCB = [this, Ident,
10907 ToBeDeleted](Function &OutlinedFn) mutable {
10908 // The stale call instruction will be replaced with a new call instruction
10909 // for runtime call with the outlined function.
10910
10911 assert(OutlinedFn.hasOneUse() &&
10912 "there must be a single user for the outlined function");
10913 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10914 ToBeDeleted.push_back(StaleCI);
10915
10916 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10917 "Outlined function must have two or three arguments only");
10918
10919 bool HasShared = OutlinedFn.arg_size() == 3;
10920
10921 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10922 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10923 if (HasShared)
10924 OutlinedFn.getArg(2)->setName("data");
10925
10926 // Call to the runtime function for teams in the current function.
10927 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10928 "outlined function.");
10929 Builder.SetInsertPoint(StaleCI);
10930 SmallVector<Value *> Args = {
10931 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10932 if (HasShared)
10933 Args.push_back(StaleCI->getArgOperand(2));
10936 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10937 Args);
10938
10939 for (Instruction *I : llvm::reverse(ToBeDeleted))
10940 I->eraseFromParent();
10941 };
10942
10943 if (!Config.isTargetDevice())
10944 OI.PostOutlineCB = HostPostOutlineCB;
10945
10946 addOutlineInfo(std::move(OI));
10947
10948 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10949
10950 return Builder.saveIP();
10951}
10952
10955 InsertPointTy OuterAllocaIP,
10956 BodyGenCallbackTy BodyGenCB) {
10957 if (!updateToLocation(Loc))
10958 return InsertPointTy();
10959
10960 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10961
10962 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10963 BasicBlock *BodyBB =
10964 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10965 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10966 }
10967 BasicBlock *ExitBB =
10968 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10969 BasicBlock *BodyBB =
10970 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10971 BasicBlock *AllocaBB =
10972 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10973
10974 // Generate the body of distribute clause
10975 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10976 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10977 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10978 return Err;
10979
10980 // When using target we use different runtime functions which require a
10981 // callback.
10982 if (Config.isTargetDevice()) {
10983 OutlineInfo OI;
10984 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10985 OI.EntryBB = AllocaBB;
10986 OI.ExitBB = ExitBB;
10987
10988 addOutlineInfo(std::move(OI));
10989 }
10990 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10991
10992 return Builder.saveIP();
10993}
10994
10997 std::string VarName) {
10998 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11000 Names.size()),
11001 Names);
11002 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11003 M, MapNamesArrayInit->getType(),
11004 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11005 VarName);
11006 return MapNamesArrayGlobal;
11007}
11008
11009// Create all simple and struct types exposed by the runtime and remember
11010// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
  // Pointer-typed members for arrays/structs use the configured default
  // target address space; function pointers use the module's program
  // address space from the data layout.
  unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
// Each OMP_*_TYPE macro below is expanded once per entry of OMPKinds.def via
// the #include at the end, assigning the corresponding member of this builder
// (and, where applicable, its pointer-type twin).
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
  VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
  VarName##Ptr = PointerType::get(Ctx, ProgramAS);
// Struct types are looked up by name first so a type already present in the
// context (e.g. from a prior initialization) is reused instead of recreated.
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
  T = StructType::getTypeByName(Ctx, StructName); \
  if (!T) \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
  VarName = T; \
  VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
11031
11034 SmallVectorImpl<BasicBlock *> &BlockVector) {
11036 BlockSet.insert(EntryBB);
11037 BlockSet.insert(ExitBB);
11038
11039 Worklist.push_back(EntryBB);
11040 while (!Worklist.empty()) {
11041 BasicBlock *BB = Worklist.pop_back_val();
11042 BlockVector.push_back(BB);
11043 for (BasicBlock *SuccBB : successors(BB))
11044 if (BlockSet.insert(SuccBB).second)
11045 Worklist.push_back(SuccBB);
11046 }
11047}
11048
11050 uint64_t Size, int32_t Flags,
11052 StringRef Name) {
11053 if (!Config.isGPU()) {
11056 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11057 return;
11058 }
11059 // TODO: Add support for global variables on the device after declare target
11060 // support.
11061 Function *Fn = dyn_cast<Function>(Addr);
11062 if (!Fn)
11063 return;
11064
11065 // Add a function attribute for the kernel.
11066 Fn->addFnAttr("kernel");
11067 if (T.isAMDGCN())
11068 Fn->addFnAttr("uniform-work-group-size");
11069 Fn->addFnAttr(Attribute::MustProgress);
11070}
11071
11072// We only generate metadata for function that contain target regions.
11075
11076 // If there are no entries, we don't need to do anything.
11077 if (OffloadInfoManager.empty())
11078 return;
11079
11080 LLVMContext &C = M.getContext();
11083 16>
11084 OrderedEntries(OffloadInfoManager.size());
11085
11086 // Auxiliary methods to create metadata values and strings.
11087 auto &&GetMDInt = [this](unsigned V) {
11088 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11089 };
11090
11091 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11092
11093 // Create the offloading info metadata node.
11094 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11095 auto &&TargetRegionMetadataEmitter =
11096 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11097 const TargetRegionEntryInfo &EntryInfo,
11099 // Generate metadata for target regions. Each entry of this metadata
11100 // contains:
11101 // - Entry 0 -> Kind of this type of metadata (0).
11102 // - Entry 1 -> Device ID of the file where the entry was identified.
11103 // - Entry 2 -> File ID of the file where the entry was identified.
11104 // - Entry 3 -> Mangled name of the function where the entry was
11105 // identified.
11106 // - Entry 4 -> Line in the file where the entry was identified.
11107 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11108 // - Entry 6 -> Order the entry was created.
11109 // The first element of the metadata node is the kind.
11110 Metadata *Ops[] = {
11111 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11112 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11113 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11114 GetMDInt(E.getOrder())};
11115
11116 // Save this entry in the right position of the ordered entries array.
11117 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11118
11119 // Add metadata to the named metadata node.
11120 MD->addOperand(MDNode::get(C, Ops));
11121 };
11122
11123 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11124
11125 // Create function that emits metadata for each device global variable entry;
11126 auto &&DeviceGlobalVarMetadataEmitter =
11127 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11128 StringRef MangledName,
11130 // Generate metadata for global variables. Each entry of this metadata
11131 // contains:
11132 // - Entry 0 -> Kind of this type of metadata (1).
11133 // - Entry 1 -> Mangled name of the variable.
11134 // - Entry 2 -> Declare target kind.
11135 // - Entry 3 -> Order the entry was created.
11136 // The first element of the metadata node is the kind.
11137 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11138 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11139
11140 // Save this entry in the right position of the ordered entries array.
11141 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11142 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11143
11144 // Add metadata to the named metadata node.
11145 MD->addOperand(MDNode::get(C, Ops));
11146 };
11147
11148 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11149 DeviceGlobalVarMetadataEmitter);
11150
11151 for (const auto &E : OrderedEntries) {
11152 assert(E.first && "All ordered entries must exist!");
11153 if (const auto *CE =
11155 E.first)) {
11156 if (!CE->getID() || !CE->getAddress()) {
11157        // Do not blame the entry if the parent function is not emitted.
11158 TargetRegionEntryInfo EntryInfo = E.second;
11159 StringRef FnName = EntryInfo.ParentName;
11160 if (!M.getNamedValue(FnName))
11161 continue;
11162 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11163 continue;
11164 }
11165 createOffloadEntry(CE->getID(), CE->getAddress(),
11166 /*Size=*/0, CE->getFlags(),
11168 } else if (const auto *CE = dyn_cast<
11170 E.first)) {
11173 CE->getFlags());
11174 switch (Flags) {
11177 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11178 continue;
11179 if (!CE->getAddress()) {
11180 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11181 continue;
11182 }
11183      // The variable has no definition - no need to add the entry.
11184 if (CE->getVarSize() == 0)
11185 continue;
11186 break;
11188 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11189 (!Config.isTargetDevice() && CE->getAddress())) &&
11190 "Declaret target link address is set.");
11191 if (Config.isTargetDevice())
11192 continue;
11193 if (!CE->getAddress()) {
11195 continue;
11196 }
11197 break;
11200 if (!CE->getAddress()) {
11201 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11202 continue;
11203 }
11204 break;
11205 default:
11206 break;
11207 }
11208
11209 // Hidden or internal symbols on the device are not externally visible.
11210 // We should not attempt to register them by creating an offloading
11211 // entry. Indirect variables are handled separately on the device.
11212 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11213 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11214 (Flags !=
11216 Flags != OffloadEntriesInfoManager::
11217 OMPTargetGlobalVarEntryIndirectVTable))
11218 continue;
11219
11220 // Indirect globals need to use a special name that doesn't match the name
11221 // of the associated host global.
11223 Flags ==
11225 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11226 Flags, CE->getLinkage(), CE->getVarName());
11227 else
11228 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11229 Flags, CE->getLinkage());
11230
11231 } else {
11232 llvm_unreachable("Unsupported entry kind.");
11233 }
11234 }
11235
11236 // Emit requires directive globals to a special entry so the runtime can
11237 // register them when the device image is loaded.
11238 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11239 // entries should be redesigned to better suit this use-case.
11240 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11244 ".requires", /*Size=*/0,
11246 Config.getRequiresFlags());
11247}
11248
11251 unsigned FileID, unsigned Line, unsigned Count) {
11252 raw_svector_ostream OS(Name);
11253 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11254 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11255 if (Count)
11256 OS << "_" << Count;
11257}
11258
11260 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11261 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11263 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11264 EntryInfo.Line, NewCount);
11265}
11266
11269 vfs::FileSystem &VFS,
11270 StringRef ParentName) {
11271 sys::fs::UniqueID ID(0xdeadf17e, 0);
11272 auto FileIDInfo = CallBack();
11273 uint64_t FileID = 0;
11274 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11275 ID = Status->getUniqueID();
11276 FileID = Status->getUniqueID().getFile();
11277 } else {
11278    // If the inode ID could not be determined, create a hash value of
11279    // the current file name and use that as an ID.
11280 FileID = hash_value(std::get<0>(FileIDInfo));
11281 }
11282
11283 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11284 std::get<1>(FileIDInfo));
11285}
11286
11288 unsigned Offset = 0;
11289 for (uint64_t Remain =
11290 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11292 !(Remain & 1); Remain = Remain >> 1)
11293 Offset++;
11294 return Offset;
11295}
11296
11299 // Rotate by getFlagMemberOffset() bits.
11300 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11301 << getFlagMemberOffset());
11302}
11303
11306 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11307 // If the entry is PTR_AND_OBJ but has not been marked with the special
11308 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11309 // marked as MEMBER_OF.
11310 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11312 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11315 return;
11316
11317 // Entries with ATTACH are not members-of anything. They are handled
11318 // separately by the runtime after other maps have been handled.
11319 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11321 return;
11322
11323 // Reset the placeholder value to prepare the flag for the assignment of the
11324 // proper MEMBER_OF value.
11325 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11326 Flags |= MemberOfFlag;
11327}
11328
11332 bool IsDeclaration, bool IsExternallyVisible,
11333 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11334 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11335 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11336 std::function<Constant *()> GlobalInitializer,
11337 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11338 // TODO: convert this to utilise the IRBuilder Config rather than
11339 // a passed down argument.
11340 if (OpenMPSIMD)
11341 return nullptr;
11342
11345 CaptureClause ==
11347 Config.hasRequiresUnifiedSharedMemory())) {
11348 SmallString<64> PtrName;
11349 {
11350 raw_svector_ostream OS(PtrName);
11351 OS << MangledName;
11352 if (!IsExternallyVisible)
11353 OS << format("_%x", EntryInfo.FileID);
11354 OS << "_decl_tgt_ref_ptr";
11355 }
11356
11357 Value *Ptr = M.getNamedValue(PtrName);
11358
11359 if (!Ptr) {
11360 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11361 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11362
11363 auto *GV = cast<GlobalVariable>(Ptr);
11364 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11365
11366 if (!Config.isTargetDevice()) {
11367 if (GlobalInitializer)
11368 GV->setInitializer(GlobalInitializer());
11369 else
11370 GV->setInitializer(GlobalValue);
11371 }
11372
11374 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11375 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11376 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11377 }
11378
11379 return cast<Constant>(Ptr);
11380 }
11381
11382 return nullptr;
11383}
11384
11388 bool IsDeclaration, bool IsExternallyVisible,
11389 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11390 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11391 std::vector<Triple> TargetTriple,
11392 std::function<Constant *()> GlobalInitializer,
11393 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11394 Constant *Addr) {
11396 (TargetTriple.empty() && !Config.isTargetDevice()))
11397 return;
11398
11400 StringRef VarName;
11401 int64_t VarSize;
11403
11405 CaptureClause ==
11407 !Config.hasRequiresUnifiedSharedMemory()) {
11409 VarName = MangledName;
11410 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11411
11412 if (!IsDeclaration)
11413 VarSize = divideCeil(
11414 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11415 else
11416 VarSize = 0;
11417 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11418
11419 // This is a workaround carried over from Clang which prevents undesired
11420 // optimisation of internal variables.
11421 if (Config.isTargetDevice() &&
11422 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11423 // Do not create a "ref-variable" if the original is not also available
11424 // on the host.
11425 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11426 return;
11427
11428 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11429
11430 if (!M.getNamedValue(RefName)) {
11431 Constant *AddrRef =
11432 getOrCreateInternalVariable(Addr->getType(), RefName);
11433 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11434 GvAddrRef->setConstant(true);
11435 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11436 GvAddrRef->setInitializer(Addr);
11437 GeneratedRefs.push_back(GvAddrRef);
11438 }
11439 }
11440 } else {
11443 else
11445
11446 if (Config.isTargetDevice()) {
11447 VarName = (Addr) ? Addr->getName() : "";
11448 Addr = nullptr;
11449 } else {
11451 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11452 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11453 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11454 VarName = (Addr) ? Addr->getName() : "";
11455 }
11456 VarSize = M.getDataLayout().getPointerSize();
11458 }
11459
11460 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11461 Flags, Linkage);
11462}
11463
11464/// Loads all the offload entries information from the host IR
11465/// metadata.
11467 // If we are in target mode, load the metadata from the host IR. This code has
11468 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11469
11470 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11471 if (!MD)
11472 return;
11473
11474 for (MDNode *MN : MD->operands()) {
11475 auto &&GetMDInt = [MN](unsigned Idx) {
11476 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11477 return cast<ConstantInt>(V->getValue())->getZExtValue();
11478 };
11479
11480 auto &&GetMDString = [MN](unsigned Idx) {
11481 auto *V = cast<MDString>(MN->getOperand(Idx));
11482 return V->getString();
11483 };
11484
11485 switch (GetMDInt(0)) {
11486 default:
11487 llvm_unreachable("Unexpected metadata!");
11488 break;
11489 case OffloadEntriesInfoManager::OffloadEntryInfo::
11490 OffloadingEntryInfoTargetRegion: {
11491 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11492 /*DeviceID=*/GetMDInt(1),
11493 /*FileID=*/GetMDInt(2),
11494 /*Line=*/GetMDInt(4),
11495 /*Count=*/GetMDInt(5));
11496 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11497 /*Order=*/GetMDInt(6));
11498 break;
11499 }
11500 case OffloadEntriesInfoManager::OffloadEntryInfo::
11501 OffloadingEntryInfoDeviceGlobalVar:
11502 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11503 /*MangledName=*/GetMDString(1),
11505 /*Flags=*/GetMDInt(2)),
11506 /*Order=*/GetMDInt(3));
11507 break;
11508 }
11509 }
11510}
11511
11513 StringRef HostFilePath) {
11514 if (HostFilePath.empty())
11515 return;
11516
11517 auto Buf = VFS.getBufferForFile(HostFilePath);
11518 if (std::error_code Err = Buf.getError()) {
11519 report_fatal_error(("error opening host file from host file path inside of "
11520 "OpenMPIRBuilder: " +
11521 Err.message())
11522 .c_str());
11523 }
11524
11525 LLVMContext Ctx;
11527 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11528 if (std::error_code Err = M.getError()) {
11530 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11531 .c_str());
11532 }
11533
11534 loadOffloadInfoMetadata(*M.get());
11535}
11536
11537//===----------------------------------------------------------------------===//
11538// OffloadEntriesInfoManager
11539//===----------------------------------------------------------------------===//
11540
11542 return OffloadEntriesTargetRegion.empty() &&
11543 OffloadEntriesDeviceGlobalVar.empty();
11544}
11545
11546unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11547 const TargetRegionEntryInfo &EntryInfo) const {
11548 auto It = OffloadEntriesTargetRegionCount.find(
11549 getTargetRegionEntryCountKey(EntryInfo));
11550 if (It == OffloadEntriesTargetRegionCount.end())
11551 return 0;
11552 return It->second;
11553}
11554
/// Advance the per-location region counter for \p EntryInfo.
void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) {
  // Note: this stores EntryInfo.Count + 1 rather than incrementing the mapped
  // value in place; callers are expected to have refreshed EntryInfo.Count to
  // the current counter value (via getTargetRegionEntryInfoCount) beforehand.
  OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
      EntryInfo.Count + 1;
}
11560
11561/// Initialize target region entry.
11563 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11564 OffloadEntriesTargetRegion[EntryInfo] =
11565 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11567 ++OffloadingEntriesNum;
11568}
11569
11571 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11573 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11574
11575 // Update the EntryInfo with the next available count for this location.
11576 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11577
11578 // If we are emitting code for a target, the entry is already initialized,
11579 // only has to be registered.
11580 if (OMPBuilder->Config.isTargetDevice()) {
11581 // This could happen if the device compilation is invoked standalone.
11582 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11583 return;
11584 }
11585 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11586 Entry.setAddress(Addr);
11587 Entry.setID(ID);
11588 Entry.setFlags(Flags);
11589 } else {
11591 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11592 return;
11593 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11594 "Target region entry already registered!");
11595 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11596 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11597 ++OffloadingEntriesNum;
11598 }
11599 incrementTargetRegionEntryInfoCount(EntryInfo);
11600}
11601
11603 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11604
11605 // Update the EntryInfo with the next available count for this location.
11606 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11607
11608 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11609 if (It == OffloadEntriesTargetRegion.end()) {
11610 return false;
11611 }
11612 // Fail if this entry is already registered.
11613 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11614 return false;
11615 return true;
11616}
11617
11619 const OffloadTargetRegionEntryInfoActTy &Action) {
11620 // Scan all target region entries and perform the provided action.
11621 for (const auto &It : OffloadEntriesTargetRegion) {
11622 Action(It.first, It.second);
11623 }
11624}
11625
11627 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11628 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11629 ++OffloadingEntriesNum;
11630}
11631
11633 StringRef VarName, Constant *Addr, int64_t VarSize,
11635 if (OMPBuilder->Config.isTargetDevice()) {
11636 // This could happen if the device compilation is invoked standalone.
11637 if (!hasDeviceGlobalVarEntryInfo(VarName))
11638 return;
11639 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11640 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11641 if (Entry.getVarSize() == 0) {
11642 Entry.setVarSize(VarSize);
11643 Entry.setLinkage(Linkage);
11644 }
11645 return;
11646 }
11647 Entry.setVarSize(VarSize);
11648 Entry.setLinkage(Linkage);
11649 Entry.setAddress(Addr);
11650 } else {
11651 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11652 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11653 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11654 "Entry not initialized!");
11655 if (Entry.getVarSize() == 0) {
11656 Entry.setVarSize(VarSize);
11657 Entry.setLinkage(Linkage);
11658 }
11659 return;
11660 }
11662 Flags ==
11664 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11665 Addr, VarSize, Flags, Linkage,
11666 VarName.str());
11667 else
11668 OffloadEntriesDeviceGlobalVar.try_emplace(
11669 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11670 ++OffloadingEntriesNum;
11671 }
11672}
11673
11676 // Scan all target region entries and perform the provided action.
11677 for (const auto &E : OffloadEntriesDeviceGlobalVar)
11678 Action(E.getKey(), E.getValue());
11679}
11680
11681//===----------------------------------------------------------------------===//
11682// CanonicalLoopInfo
11683//===----------------------------------------------------------------------===//
11684
11685void CanonicalLoopInfo::collectControlBlocks(
11687 // We only count those BBs as control block for which we do not need to
11688 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
11689 // flow. For consistency, this also means we do not add the Body block, which
11690 // is just the entry to the body code.
11691 BBs.reserve(BBs.size() + 6);
11692 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
11693}
11694
11696 assert(isValid() && "Requires a valid canonical loop");
11697 for (BasicBlock *Pred : predecessors(Header)) {
11698 if (Pred != Latch)
11699 return Pred;
11700 }
11701 llvm_unreachable("Missing preheader");
11702}
11703
11704void CanonicalLoopInfo::setTripCount(Value *TripCount) {
11705 assert(isValid() && "Requires a valid canonical loop");
11706
11707 Instruction *CmpI = &getCond()->front();
11708 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
11709 CmpI->setOperand(1, TripCount);
11710
11711#ifndef NDEBUG
11712 assertOK();
11713#endif
11714}
11715
11716void CanonicalLoopInfo::mapIndVar(
11717 llvm::function_ref<Value *(Instruction *)> Updater) {
11718 assert(isValid() && "Requires a valid canonical loop");
11719
11720 Instruction *OldIV = getIndVar();
11721
11722 // Record all uses excluding those introduced by the updater. Uses by the
11723 // CanonicalLoopInfo itself to keep track of the number of iterations are
11724 // excluded.
11725 SmallVector<Use *> ReplacableUses;
11726 for (Use &U : OldIV->uses()) {
11727 auto *User = dyn_cast<Instruction>(U.getUser());
11728 if (!User)
11729 continue;
11730 if (User->getParent() == getCond())
11731 continue;
11732 if (User->getParent() == getLatch())
11733 continue;
11734 ReplacableUses.push_back(&U);
11735 }
11736
11737 // Run the updater that may introduce new uses
11738 Value *NewIV = Updater(OldIV);
11739
11740 // Replace the old uses with the value returned by the updater.
11741 for (Use *U : ReplacableUses)
11742 U->set(NewIV);
11743
11744#ifndef NDEBUG
11745 assertOK();
11746#endif
11747}
11748
11750#ifndef NDEBUG
11751 // No constraints if this object currently does not describe a loop.
11752 if (!isValid())
11753 return;
11754
11755 BasicBlock *Preheader = getPreheader();
11756 BasicBlock *Body = getBody();
11757 BasicBlock *After = getAfter();
11758
11759 // Verify standard control-flow we use for OpenMP loops.
11760 assert(Preheader);
11761 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11762 "Preheader must terminate with unconditional branch");
11763 assert(Preheader->getSingleSuccessor() == Header &&
11764 "Preheader must jump to header");
11765
11766 assert(Header);
11767 assert(isa<BranchInst>(Header->getTerminator()) &&
11768 "Header must terminate with unconditional branch");
11769 assert(Header->getSingleSuccessor() == Cond &&
11770 "Header must jump to exiting block");
11771
11772 assert(Cond);
11773 assert(Cond->getSinglePredecessor() == Header &&
11774 "Exiting block only reachable from header");
11775
11776 assert(isa<BranchInst>(Cond->getTerminator()) &&
11777 "Exiting block must terminate with conditional branch");
11778 assert(size(successors(Cond)) == 2 &&
11779 "Exiting block must have two successors");
11780 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11781 "Exiting block's first successor jump to the body");
11782 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11783 "Exiting block's second successor must exit the loop");
11784
11785 assert(Body);
11786 assert(Body->getSinglePredecessor() == Cond &&
11787 "Body only reachable from exiting block");
11788 assert(!isa<PHINode>(Body->front()));
11789
11790 assert(Latch);
11791 assert(isa<BranchInst>(Latch->getTerminator()) &&
11792 "Latch must terminate with unconditional branch");
11793 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11794 // TODO: To support simple redirecting of the end of the body code that has
11795 // multiple; introduce another auxiliary basic block like preheader and after.
11796 assert(Latch->getSinglePredecessor() != nullptr);
11797 assert(!isa<PHINode>(Latch->front()));
11798
11799 assert(Exit);
11800 assert(isa<BranchInst>(Exit->getTerminator()) &&
11801 "Exit block must terminate with unconditional branch");
11802 assert(Exit->getSingleSuccessor() == After &&
11803 "Exit block must jump to after block");
11804
11805 assert(After);
11806 assert(After->getSinglePredecessor() == Exit &&
11807 "After block only reachable from exit block");
11808 assert(After->empty() || !isa<PHINode>(After->front()));
11809
11810 Instruction *IndVar = getIndVar();
11811 assert(IndVar && "Canonical induction variable not found?");
11812 assert(isa<IntegerType>(IndVar->getType()) &&
11813 "Induction variable must be an integer");
11814 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11815 "Induction variable must be a PHI in the loop header");
11816 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11817 assert(
11818 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11819 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11820
11821 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11822 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11823 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11824 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11825 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11826 ->isOne());
11827
11828 Value *TripCount = getTripCount();
11829 assert(TripCount && "Loop trip count not found?");
11830 assert(IndVar->getType() == TripCount->getType() &&
11831 "Trip count and induction variable must have the same type");
11832
11833 auto *CmpI = cast<CmpInst>(&Cond->front());
11834 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11835 "Exit condition must be a signed less-than comparison");
11836 assert(CmpI->getOperand(0) == IndVar &&
11837 "Exit condition must compare the induction variable");
11838 assert(CmpI->getOperand(1) == TripCount &&
11839 "Exit condition must compare with the trip count");
11840#endif
11841}
11842
11844 Header = nullptr;
11845 Cond = nullptr;
11846 Latch = nullptr;
11847 Exit = nullptr;
11848}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:486
bool empty() const
Definition BasicBlock.h:492
const Instruction & back() const
Definition BasicBlock.h:495
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:493
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:397
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:668
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:568
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:639
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:446
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:667
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2787
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:991
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' entry.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' entry.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exists) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1092
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1154
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1170
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:962
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:708
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:370
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:302
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
a struct to pack relevant information while generating atomic Ops
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...