LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
64
65#include <cstdint>
66#include <optional>
67
68#define DEBUG_TYPE "openmp-ir-builder"
69
70using namespace llvm;
71using namespace omp;
72
73static cl::opt<bool>
74 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
75 cl::desc("Use optimistic attributes describing "
76 "'as-if' properties of runtime calls."),
77 cl::init(false));
78
80 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
81 cl::desc("Factor for the unroll threshold to account for code "
82 "simplifications still taking place"),
83 cl::init(1.5));
84
85#ifndef NDEBUG
86/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
87/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
88/// an InsertPoint stores the instruction before something is inserted. For
89/// instance, if both point to the same instruction, two IRBuilders alternating
90/// creating instruction will cause the instructions to be interleaved.
93 if (!IP1.isSet() || !IP2.isSet())
94 return false;
95 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
96}
97
99 // Valid ordered/unordered and base algorithm combinations.
100 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
101 case OMPScheduleType::UnorderedStaticChunked:
102 case OMPScheduleType::UnorderedStatic:
103 case OMPScheduleType::UnorderedDynamicChunked:
104 case OMPScheduleType::UnorderedGuidedChunked:
105 case OMPScheduleType::UnorderedRuntime:
106 case OMPScheduleType::UnorderedAuto:
107 case OMPScheduleType::UnorderedTrapezoidal:
108 case OMPScheduleType::UnorderedGreedy:
109 case OMPScheduleType::UnorderedBalanced:
110 case OMPScheduleType::UnorderedGuidedIterativeChunked:
111 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
112 case OMPScheduleType::UnorderedSteal:
113 case OMPScheduleType::UnorderedStaticBalancedChunked:
114 case OMPScheduleType::UnorderedGuidedSimd:
115 case OMPScheduleType::UnorderedRuntimeSimd:
116 case OMPScheduleType::OrderedStaticChunked:
117 case OMPScheduleType::OrderedStatic:
118 case OMPScheduleType::OrderedDynamicChunked:
119 case OMPScheduleType::OrderedGuidedChunked:
120 case OMPScheduleType::OrderedRuntime:
121 case OMPScheduleType::OrderedAuto:
122 case OMPScheduleType::OrderdTrapezoidal:
123 case OMPScheduleType::NomergeUnorderedStaticChunked:
124 case OMPScheduleType::NomergeUnorderedStatic:
125 case OMPScheduleType::NomergeUnorderedDynamicChunked:
126 case OMPScheduleType::NomergeUnorderedGuidedChunked:
127 case OMPScheduleType::NomergeUnorderedRuntime:
128 case OMPScheduleType::NomergeUnorderedAuto:
129 case OMPScheduleType::NomergeUnorderedTrapezoidal:
130 case OMPScheduleType::NomergeUnorderedGreedy:
131 case OMPScheduleType::NomergeUnorderedBalanced:
132 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
133 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
134 case OMPScheduleType::NomergeUnorderedSteal:
135 case OMPScheduleType::NomergeOrderedStaticChunked:
136 case OMPScheduleType::NomergeOrderedStatic:
137 case OMPScheduleType::NomergeOrderedDynamicChunked:
138 case OMPScheduleType::NomergeOrderedGuidedChunked:
139 case OMPScheduleType::NomergeOrderedRuntime:
140 case OMPScheduleType::NomergeOrderedAuto:
141 case OMPScheduleType::NomergeOrderedTrapezoidal:
142 case OMPScheduleType::OrderedDistributeChunked:
143 case OMPScheduleType::OrderedDistribute:
144 break;
145 default:
146 return false;
147 }
148
149 // Must not set both monotonicity modifiers at the same time.
150 OMPScheduleType MonotonicityFlags =
151 SchedType & OMPScheduleType::MonotonicityMask;
152 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
153 return false;
154
155 return true;
156}
157#endif
158
159/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
160/// debug location to the last instruction in the specified basic block if the
161/// insert point points to the end of the block.
164 Builder.restoreIP(IP);
165 llvm::BasicBlock *BB = Builder.GetInsertBlock();
166 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
167 if (!BB->empty() && I == BB->end())
168 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
169}
170
171static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
172 if (T.isAMDGPU()) {
173 StringRef Features =
174 Kernel->getFnAttribute("target-features").getValueAsString();
175 if (Features.count("+wavefrontsize64"))
178 }
179 if (T.isNVPTX())
181 if (T.isSPIRV())
183 llvm_unreachable("No grid value available for this architecture!");
184}
185
186/// Determine which scheduling algorithm to use, determined from schedule clause
187/// arguments.
188static OMPScheduleType
189getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
190 bool HasSimdModifier, bool HasDistScheduleChunks) {
 191 // Currently, the default schedule is static.
192 switch (ClauseKind) {
193 case OMP_SCHEDULE_Default:
194 case OMP_SCHEDULE_Static:
195 return HasChunks ? OMPScheduleType::BaseStaticChunked
196 : OMPScheduleType::BaseStatic;
197 case OMP_SCHEDULE_Dynamic:
198 return OMPScheduleType::BaseDynamicChunked;
199 case OMP_SCHEDULE_Guided:
200 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
201 : OMPScheduleType::BaseGuidedChunked;
202 case OMP_SCHEDULE_Auto:
204 case OMP_SCHEDULE_Runtime:
205 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
206 : OMPScheduleType::BaseRuntime;
207 case OMP_SCHEDULE_Distribute:
208 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
209 : OMPScheduleType::BaseDistribute;
210 }
211 llvm_unreachable("unhandled schedule clause argument");
212}
213
214/// Adds ordering modifier flags to schedule type.
215static OMPScheduleType
217 bool HasOrderedClause) {
218 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
219 OMPScheduleType::None &&
220 "Must not have ordering nor monotonicity flags already set");
221
222 OMPScheduleType OrderingModifier = HasOrderedClause
223 ? OMPScheduleType::ModifierOrdered
224 : OMPScheduleType::ModifierUnordered;
225 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
226
227 // Unsupported combinations
228 if (OrderingScheduleType ==
229 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
230 return OMPScheduleType::OrderedGuidedChunked;
231 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
232 OMPScheduleType::ModifierOrdered))
233 return OMPScheduleType::OrderedRuntime;
234
235 return OrderingScheduleType;
236}
237
238/// Adds monotonicity modifier flags to schedule type.
239static OMPScheduleType
241 bool HasSimdModifier, bool HasMonotonic,
242 bool HasNonmonotonic, bool HasOrderedClause) {
243 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
244 OMPScheduleType::None &&
245 "Must not have monotonicity flags already set");
246 assert((!HasMonotonic || !HasNonmonotonic) &&
247 "Monotonic and Nonmonotonic are contradicting each other");
248
249 if (HasMonotonic) {
250 return ScheduleType | OMPScheduleType::ModifierMonotonic;
251 } else if (HasNonmonotonic) {
252 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
253 } else {
254 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
255 // If the static schedule kind is specified or if the ordered clause is
256 // specified, and if the nonmonotonic modifier is not specified, the
257 // effect is as if the monotonic modifier is specified. Otherwise, unless
258 // the monotonic modifier is specified, the effect is as if the
259 // nonmonotonic modifier is specified.
260 OMPScheduleType BaseScheduleType =
261 ScheduleType & ~OMPScheduleType::ModifierMask;
262 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
263 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
264 HasOrderedClause) {
265 // The monotonic is used by default in openmp runtime library, so no need
266 // to set it.
267 return ScheduleType;
268 } else {
269 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
270 }
271 }
272}
273
274/// Determine the schedule type using schedule and ordering clause arguments.
275static OMPScheduleType
276computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
277 bool HasSimdModifier, bool HasMonotonicModifier,
278 bool HasNonmonotonicModifier, bool HasOrderedClause,
279 bool HasDistScheduleChunks) {
281 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
282 OMPScheduleType OrderedSchedule =
283 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
285 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
286 HasNonmonotonicModifier, HasOrderedClause);
287
289 return Result;
290}
291
292/// Make \p Source branch to \p Target.
293///
294/// Handles two situations:
295/// * \p Source already has an unconditional branch.
296/// * \p Source is a degenerate block (no terminator because the BB is
297/// the current head of the IR construction).
299 if (Instruction *Term = Source->getTerminator()) {
300 auto *Br = cast<UncondBrInst>(Term);
301 BasicBlock *Succ = Br->getSuccessor();
302 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
303 Br->setSuccessor(Target);
304 return;
305 }
306
307 auto *NewBr = UncondBrInst::Create(Target, Source);
308 NewBr->setDebugLoc(DL);
309}
310
312 bool CreateBranch, DebugLoc DL) {
313 assert(New->getFirstInsertionPt() == New->begin() &&
314 "Target BB must not have PHI nodes");
315
316 // Move instructions to new block.
317 BasicBlock *Old = IP.getBlock();
318 // If the `Old` block is empty then there are no instructions to move. But in
319 // the new debug scheme, it could have trailing debug records which will be
320 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
321 // reasons:
322 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
323 // 2. Even if `New` is not empty, the rationale to move those records to `New`
324 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
325 // assumes that `Old` is optimized out and is going away. This is not the case
326 // here. The `Old` block is still being used e.g. a branch instruction is
327 // added to it later in this function.
328 // So we call `BasicBlock::splice` only when `Old` is not empty.
329 if (!Old->empty())
330 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
331
332 if (CreateBranch) {
333 auto *NewBr = UncondBrInst::Create(New, Old);
334 NewBr->setDebugLoc(DL);
335 }
336}
337
338void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
339 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
340 BasicBlock *Old = Builder.GetInsertBlock();
341
342 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
343 if (CreateBranch)
344 Builder.SetInsertPoint(Old->getTerminator());
345 else
346 Builder.SetInsertPoint(Old);
347
348 // SetInsertPoint also updates the Builder's debug location, but we want to
349 // keep the one the Builder was configured to use.
350 Builder.SetCurrentDebugLocation(DebugLoc);
351}
352
354 DebugLoc DL, llvm::Twine Name) {
355 BasicBlock *Old = IP.getBlock();
357 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
358 Old->getParent(), Old->getNextNode());
359 spliceBB(IP, New, CreateBranch, DL);
360 New->replaceSuccessorsPhiUsesWith(Old, New);
361 return New;
362}
363
364BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
365 llvm::Twine Name) {
366 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
367 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
368 if (CreateBranch)
369 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
370 else
371 Builder.SetInsertPoint(Builder.GetInsertBlock());
372 // SetInsertPoint also updates the Builder's debug location, but we want to
373 // keep the one the Builder was configured to use.
374 Builder.SetCurrentDebugLocation(DebugLoc);
375 return New;
376}
377
378BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
379 llvm::Twine Name) {
380 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
381 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
382 if (CreateBranch)
383 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
384 else
385 Builder.SetInsertPoint(Builder.GetInsertBlock());
386 // SetInsertPoint also updates the Builder's debug location, but we want to
387 // keep the one the Builder was configured to use.
388 Builder.SetCurrentDebugLocation(DebugLoc);
389 return New;
390}
391
393 llvm::Twine Suffix) {
394 BasicBlock *Old = Builder.GetInsertBlock();
395 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
396}
397
398// This function creates a fake integer value and a fake use for the integer
399// value. It returns the fake value created. This is useful in modeling the
400// extra arguments to the outlined functions.
402 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
404 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
405 const Twine &Name = "", bool AsPtr = true,
406 bool Is64Bit = false) {
407 Builder.restoreIP(OuterAllocaIP);
408 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
409 Instruction *FakeVal;
410 AllocaInst *FakeValAddr =
411 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
412 ToBeDeleted.push_back(FakeValAddr);
413
414 if (AsPtr) {
415 FakeVal = FakeValAddr;
416 } else {
417 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
418 ToBeDeleted.push_back(FakeVal);
419 }
420
421 // Generate a fake use of this value
422 Builder.restoreIP(InnerAllocaIP);
423 Instruction *UseFakeVal;
424 if (AsPtr) {
425 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
426 } else {
427 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
428 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
429 }
430 ToBeDeleted.push_back(UseFakeVal);
431 return FakeVal;
432}
433
434//===----------------------------------------------------------------------===//
435// OpenMPIRBuilderConfig
436//===----------------------------------------------------------------------===//
437
438namespace {
440/// Values for bit flags for marking which requires clauses have been used.
441enum OpenMPOffloadingRequiresDirFlags {
442 /// flag undefined.
443 OMP_REQ_UNDEFINED = 0x000,
444 /// no requires directive present.
445 OMP_REQ_NONE = 0x001,
446 /// reverse_offload clause.
447 OMP_REQ_REVERSE_OFFLOAD = 0x002,
448 /// unified_address clause.
449 OMP_REQ_UNIFIED_ADDRESS = 0x004,
450 /// unified_shared_memory clause.
451 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
452 /// dynamic_allocators clause.
453 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
454 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
455};
456
457} // anonymous namespace
458
460 : RequiresFlags(OMP_REQ_UNDEFINED) {}
461
464 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
465 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
468 RequiresFlags(OMP_REQ_UNDEFINED) {
469 if (HasRequiresReverseOffload)
470 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
471 if (HasRequiresUnifiedAddress)
472 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
473 if (HasRequiresUnifiedSharedMemory)
474 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
475 if (HasRequiresDynamicAllocators)
476 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
477}
478
480 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
481}
482
484 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
485}
486
488 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
489}
490
492 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
493}
494
496 return hasRequiresFlags() ? RequiresFlags
497 : static_cast<int64_t>(OMP_REQ_NONE);
498}
499
501 if (Value)
502 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
503 else
504 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
505}
506
508 if (Value)
509 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
510 else
511 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
512}
513
515 if (Value)
516 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
517 else
518 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
519}
520
522 if (Value)
523 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
524 else
525 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
526}
527
528//===----------------------------------------------------------------------===//
529// OpenMPIRBuilder
530//===----------------------------------------------------------------------===//
531
534 SmallVector<Value *> &ArgsVector) {
536 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
537 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
538 constexpr size_t MaxDim = 3;
539 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
540
541 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
542
543 Value *DynCGroupMemFallbackFlag =
544 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
545 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
546 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
547
548 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
549
550 Value *NumTeams3D =
551 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
552 Value *NumThreads3D =
553 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
554 for (unsigned I :
555 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
556 NumTeams3D =
557 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
558 for (unsigned I :
559 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
560 NumThreads3D =
561 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
562
563 ArgsVector = {Version,
564 PointerNum,
565 KernelArgs.RTArgs.BasePointersArray,
566 KernelArgs.RTArgs.PointersArray,
567 KernelArgs.RTArgs.SizesArray,
568 KernelArgs.RTArgs.MapTypesArray,
569 KernelArgs.RTArgs.MapNamesArray,
570 KernelArgs.RTArgs.MappersArray,
571 KernelArgs.NumIterations,
572 Flags,
573 NumTeams3D,
574 NumThreads3D,
575 KernelArgs.DynCGroupMem};
576}
577
579 LLVMContext &Ctx = Fn.getContext();
580
581 // Get the function's current attributes.
582 auto Attrs = Fn.getAttributes();
583 auto FnAttrs = Attrs.getFnAttrs();
584 auto RetAttrs = Attrs.getRetAttrs();
586 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
587 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
588
589 // Add AS to FnAS while taking special care with integer extensions.
590 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
591 bool Param = true) -> void {
592 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
593 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
594 if (HasSignExt || HasZeroExt) {
595 assert(AS.getNumAttributes() == 1 &&
596 "Currently not handling extension attr combined with others.");
597 if (Param) {
598 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
599 FnAS = FnAS.addAttribute(Ctx, AK);
600 } else if (auto AK =
601 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
602 FnAS = FnAS.addAttribute(Ctx, AK);
603 } else {
604 FnAS = FnAS.addAttributes(Ctx, AS);
605 }
606 };
607
608#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
609#include "llvm/Frontend/OpenMP/OMPKinds.def"
610
611 // Add attributes to the function declaration.
612 switch (FnID) {
613#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
614 case Enum: \
615 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
616 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
617 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
618 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
619 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
620 break;
621#include "llvm/Frontend/OpenMP/OMPKinds.def"
622 default:
623 // Attributes are optional.
624 break;
625 }
626}
627
630 FunctionType *FnTy = nullptr;
631 Function *Fn = nullptr;
632
633 // Try to find the declation in the module first.
634 switch (FnID) {
635#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
636 case Enum: \
637 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
638 IsVarArg); \
639 Fn = M.getFunction(Str); \
640 break;
641#include "llvm/Frontend/OpenMP/OMPKinds.def"
642 }
643
644 if (!Fn) {
645 // Create a new declaration if we need one.
646 switch (FnID) {
647#define OMP_RTL(Enum, Str, ...) \
648 case Enum: \
649 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
650 break;
651#include "llvm/Frontend/OpenMP/OMPKinds.def"
652 }
653 Fn->setCallingConv(Config.getRuntimeCC());
654 // Add information if the runtime function takes a callback function
655 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
656 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
657 LLVMContext &Ctx = Fn->getContext();
658 MDBuilder MDB(Ctx);
659 // Annotate the callback behavior of the runtime function:
660 // - The callback callee is argument number 2 (microtask).
661 // - The first two arguments of the callback callee are unknown (-1).
662 // - All variadic arguments to the runtime function are passed to the
663 // callback callee.
664 Fn->addMetadata(
665 LLVMContext::MD_callback,
667 2, {-1, -1}, /* VarArgsArePassed */ true)}));
668 }
669 }
670
671 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
672 << " with type " << *Fn->getFunctionType() << "\n");
673 addAttributes(FnID, *Fn);
674
675 } else {
676 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
677 << " with type " << *Fn->getFunctionType() << "\n");
678 }
679
680 assert(Fn && "Failed to create OpenMP runtime function");
681
682 return {FnTy, Fn};
683}
684
687 if (!FiniBB) {
688 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
690 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
691 Builder.SetInsertPoint(FiniBB);
692 // FiniCB adds the branch to the exit stub.
693 if (Error Err = FiniCB(Builder.saveIP()))
694 return Err;
695 }
696 return FiniBB;
697}
698
700 BasicBlock *OtherFiniBB) {
701 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
702 if (!FiniBB) {
703 FiniBB = OtherFiniBB;
704
705 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
706 if (Error Err = FiniCB(Builder.saveIP()))
707 return Err;
708
709 return Error::success();
710 }
711
712 // Move instructions from FiniBB to the start of OtherFiniBB.
713 auto EndIt = FiniBB->end();
714 if (FiniBB->size() >= 1)
715 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
716 EndIt = Prev;
717 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
718 EndIt);
719
720 FiniBB->replaceAllUsesWith(OtherFiniBB);
721 FiniBB->eraseFromParent();
722 FiniBB = OtherFiniBB;
723 return Error::success();
724}
725
728 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
729 assert(Fn && "Failed to create OpenMP runtime function pointer");
730 return Fn;
731}
732
735 StringRef Name) {
736 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
737 Call->setCallingConv(Config.getRuntimeCC());
738 return Call;
739}
740
741void OpenMPIRBuilder::initialize() { initializeTypes(M); }
742
745 BasicBlock &EntryBlock = Function->getEntryBlock();
746 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
747
748 // Loop over blocks looking for constant allocas, skipping the entry block
749 // as any allocas there are already in the desired location.
750 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
751 Block++) {
752 for (auto Inst = Block->getReverseIterator()->begin();
753 Inst != Block->getReverseIterator()->end();) {
755 Inst++;
757 continue;
758 AllocaInst->moveBeforePreserving(MoveLocInst);
759 } else {
760 Inst++;
761 }
762 }
763 }
764}
765
768
769 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
770 // TODO: For now, we support simple static allocations, we might need to
771 // move non-static ones as well. However, this will need further analysis to
 772 // move the length arguments as well.
774 };
775
776 for (llvm::Instruction &Inst : Block)
778 if (ShouldHoistAlloca(*AllocaInst))
779 AllocasToMove.push_back(AllocaInst);
780
781 auto InsertPoint =
782 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
783
784 for (llvm::Instruction *AllocaInst : AllocasToMove)
786}
787
789 PostDominatorTree PostDomTree(*Func);
790 for (llvm::BasicBlock &BB : *Func)
791 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
793}
794
796 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
798 SmallVector<OutlineInfo, 16> DeferredOutlines;
799 for (OutlineInfo &OI : OutlineInfos) {
800 // Skip functions that have not finalized yet; may happen with nested
801 // function generation.
802 if (Fn && OI.getFunction() != Fn) {
803 DeferredOutlines.push_back(OI);
804 continue;
805 }
806
807 ParallelRegionBlockSet.clear();
808 Blocks.clear();
809 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
810
811 Function *OuterFn = OI.getFunction();
812 CodeExtractorAnalysisCache CEAC(*OuterFn);
813 // If we generate code for the target device, we need to allocate
814 // struct for aggregate params in the device default alloca address space.
815 // OpenMP runtime requires that the params of the extracted functions are
816 // passed as zero address space pointers. This flag ensures that
817 // CodeExtractor generates correct code for extracted functions
818 // which are used by OpenMP runtime.
819 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
820 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
821 /* AggregateArgs */ true,
822 /* BlockFrequencyInfo */ nullptr,
823 /* BranchProbabilityInfo */ nullptr,
824 /* AssumptionCache */ nullptr,
825 /* AllowVarArgs */ true,
826 /* AllowAlloca */ true,
827 /* AllocaBlock*/ OI.OuterAllocaBB,
828 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
829
830 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
831 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
832 << " Exit: " << OI.ExitBB->getName() << "\n");
833 assert(Extractor.isEligible() &&
834 "Expected OpenMP outlining to be possible!");
835
836 for (auto *V : OI.ExcludeArgsFromAggregate)
837 Extractor.excludeArgFromAggregate(V);
838
839 Function *OutlinedFn =
840 Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
841
842 // Forward target-cpu, target-features attributes to the outlined function.
843 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
844 if (TargetCpuAttr.isStringAttribute())
845 OutlinedFn->addFnAttr(TargetCpuAttr);
846
847 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
848 if (TargetFeaturesAttr.isStringAttribute())
849 OutlinedFn->addFnAttr(TargetFeaturesAttr);
850
851 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
852 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
853 assert(OutlinedFn->getReturnType()->isVoidTy() &&
854 "OpenMP outlined functions should not return a value!");
855
 856 // For compatibility with the clang CG we move the outlined function after the
857 // one with the parallel region.
858 OutlinedFn->removeFromParent();
859 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
860
861 // Remove the artificial entry introduced by the extractor right away, we
862 // made our own entry block after all.
863 {
864 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
865 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
866 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
867 // Move instructions from the to-be-deleted ArtificialEntry to the entry
868 // basic block of the parallel region. CodeExtractor generates
869 // instructions to unwrap the aggregate argument and may sink
870 // allocas/bitcasts for values that are solely used in the outlined region
871 // and do not escape.
872 assert(!ArtificialEntry.empty() &&
873 "Expected instructions to add in the outlined region entry");
874 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
875 End = ArtificialEntry.rend();
876 It != End;) {
877 Instruction &I = *It;
878 It++;
879
880 if (I.isTerminator()) {
881 // Absorb any debug value that terminator may have
882 if (OI.EntryBB->getTerminator())
883 OI.EntryBB->getTerminator()->adoptDbgRecords(
884 &ArtificialEntry, I.getIterator(), false);
885 continue;
886 }
887
888 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
889 }
890
891 OI.EntryBB->moveBefore(&ArtificialEntry);
892 ArtificialEntry.eraseFromParent();
893 }
894 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
895 assert(OutlinedFn && OutlinedFn->hasNUses(1));
896
897 // Run a user callback, e.g. to add attributes.
898 if (OI.PostOutlineCB)
899 OI.PostOutlineCB(*OutlinedFn);
900
901 if (OI.FixUpNonEntryAllocas)
903 }
904
905 // Remove work items that have been completed.
906 OutlineInfos = std::move(DeferredOutlines);
907
908 // The createTarget functions embeds user written code into
909 // the target region which may inject allocas which need to
910 // be moved to the entry block of our target or risk malformed
911 // optimisations by later passes, this is only relevant for
912 // the device pass which appears to be a little more delicate
913 // when it comes to optimisations (however, we do not block on
914 // that here, it's up to the inserter to the list to do so).
 915 // This notably has to occur after the OutlinedInfo candidates
916 // have been extracted so we have an end product that will not
917 // be implicitly adversely affected by any raises unless
918 // intentionally appended to the list.
919 // NOTE: This only does so for ConstantData, it could be extended
920 // to ConstantExpr's with further effort, however, they should
921 // largely be folded when they get here. Extending it to runtime
922 // defined/read+writeable allocation sizes would be non-trivial
923 // (need to factor in movement of any stores to variables the
924 // allocation size depends on, as well as the usual loads,
925 // otherwise it'll yield the wrong result after movement) and
926 // likely be more suitable as an LLVM optimisation pass.
929
930 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
931 [](EmitMetadataErrorKind Kind,
932 const TargetRegionEntryInfo &EntryInfo) -> void {
933 errs() << "Error of kind: " << Kind
934 << " when emitting offload entries and metadata during "
935 "OMPIRBuilder finalization \n";
936 };
937
938 if (!OffloadInfoManager.empty())
940
941 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
942 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
943 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
944 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
945 }
946
947 IsFinalized = true;
948}
949
950bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
951
  // All queued outlining jobs must have been consumed (by finalize()) before
  // the builder is destroyed; otherwise outlined regions would be dropped.
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}
955
  // Materialize a constant i32 flag global. WeakODR linkage lets multiple
  // translation units emit the same flag without a duplicate-symbol error;
  // hidden visibility keeps it from being exported from the DSO.
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
966
  // Nothing to record; do not emit an empty llvm.metadata array.
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  // Appending linkage merges these arrays across translation units; the
  // "llvm.metadata" section marks the global as compiler metadata only.
  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}
987
                                            OMPTgtExecModeFlags Mode) {
  // Emit the "<kernel>_exec_mode" i8 global the device runtime reads to learn
  // the kernel's execution mode. WeakAny linkage tolerates duplicates across
  // TUs; protected visibility keeps the symbol resolvable by the plugin while
  // preventing preemption.
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}
998
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  // Cache by (source-location string, flags) so each distinct location/flag
  // combination is materialized at most once per module.
  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    // The ident struct's string field may live in a different address space
    // than the location-string constant; cast so the initializer type-checks.
    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  // Callers expect an IdentPtr-typed value regardless of the global's own
  // address space.
  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
1045
                                                uint32_t &SrcLocStrSize) {
  // Out-parameter: byte length of the location string (used by callers to
  // populate the ident struct's size field).
  SrcLocStrSize = LocStr.size();
  // Cache per string so identical locations share one global.
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}
1067
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  // Assemble the libomp-style location string ";file;function;line;column;;"
  // and intern it via the StringRef overload.
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
1085
Constant *
  // Fallback location string used when no debug location is available; the
  // format mirrors getOrCreateSrcLocStr(fn, file, line, col).
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
1091
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  // Derive a location string from debug info; fall back to the generic
  // ";unknown;..." string when there is no DILocation.
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  // Prefer the DIFile's embedded source (when present) over the module name.
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  // Subprogram names can be empty (e.g. artificial scopes); fall back to the
  // IR function name if a function was provided.
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}
1108
                                                uint32_t &SrcLocStrSize) {
  // Convenience overload: take the debug location and enclosing function
  // straight from a LocationDescription.
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}
1114
1117 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1118 "omp_global_thread_num");
1119}
1120
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  // Encode which construct this barrier belongs to in the ident flags so the
  // runtime can attribute it (explicit barrier vs. implicit for/sections/...).
  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  // A cancel barrier returns a flag; branch to finalization when it is set.
  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
1173
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  // The actual __kmpc_cancel call only happens on the "then" path.
  Builder.SetInsertPoint(ThenTI);

  // Map the canceled directive to the runtime's numeric cancel kind via the
  // table in OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1229
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  // Map the canceled directive to the runtime's numeric cancel kind via the
  // table in OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1268
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Stage the kernel arguments into a stack aggregate allocated at AllocaIP;
  // the runtime takes a single pointer to this struct.
  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
1298
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  // A nonzero return value means the device launch failed; fall back to the
  // host version of the region in that case.
  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
1361
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
1401
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  // The outliner left exactly one (temporary) call to the outlined function;
  // we replace it with the runtime invocation below.
  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_60
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_60 call
  Value *Parallel60CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function*/ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars),
      /* strict for number of threads */ Builder.getInt32(0)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  // Clean up instructions that only existed to model the region boundaries.
  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1489
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  // With an if-clause the runtime decides at run time whether to fork.
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                                            2, {-1, -1},
                                            /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  // The outliner left exactly one (temporary) call to the outlined function;
  // we replace it with the fork call below.
  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  // Clean up instructions that only existed to model the region boundaries.
  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1574
1576 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1577 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1578 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1579 omp::ProcBindKind ProcBind, bool IsCancellable) {
1580 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1581
1582 if (!updateToLocation(Loc))
1583 return Loc.IP;
1584
1585 uint32_t SrcLocStrSize;
1586 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1587 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1588 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1589 (ProcBind != OMP_PROC_BIND_default);
1590 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1591 // If we generate code for the target device, we need to allocate
1592 // struct for aggregate params in the device default alloca address space.
1593 // OpenMP runtime requires that the params of the extracted functions are
1594 // passed as zero address space pointers. This flag ensures that extracted
1595 // function arguments are declared in zero address space
1596 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1597
1598 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1599 // only if we compile for host side.
1600 if (NumThreads && !Config.isTargetDevice()) {
1601 Value *Args[] = {
1602 Ident, ThreadID,
1603 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1605 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1606 }
1607
1608 if (ProcBind != OMP_PROC_BIND_default) {
1609 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1610 Value *Args[] = {
1611 Ident, ThreadID,
1612 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1614 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1615 }
1616
1617 BasicBlock *InsertBB = Builder.GetInsertBlock();
1618 Function *OuterFn = InsertBB->getParent();
1619
1620 // Save the outer alloca block because the insertion iterator may get
1621 // invalidated and we still need this later.
1622 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1623
1624 // Vector to remember instructions we used only during the modeling but which
1625 // we want to delete at the end.
1627
1628 // Change the location to the outer alloca insertion point to create and
1629 // initialize the allocas we pass into the parallel region.
1630 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1631 Builder.restoreIP(NewOuter);
1632 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1633 AllocaInst *ZeroAddrAlloca =
1634 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1635 Instruction *TIDAddr = TIDAddrAlloca;
1636 Instruction *ZeroAddr = ZeroAddrAlloca;
1637 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1638 // Add additional casts to enforce pointers in zero address space
1639 TIDAddr = new AddrSpaceCastInst(
1640 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1641 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1642 ToBeDeleted.push_back(TIDAddr);
1643 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1644 PointerType ::get(M.getContext(), 0),
1645 "zero.addr.ascast");
1646 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1647 ToBeDeleted.push_back(ZeroAddr);
1648 }
1649
1650 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1651 // associated arguments in the outlined function, so we delete them later.
1652 ToBeDeleted.push_back(TIDAddrAlloca);
1653 ToBeDeleted.push_back(ZeroAddrAlloca);
1654
1655 // Create an artificial insertion point that will also ensure the blocks we
1656 // are about to split are not degenerated.
1657 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1658
1659 BasicBlock *EntryBB = UI->getParent();
1660 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1661 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1662 BasicBlock *PRegPreFiniBB =
1663 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1664 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1665
1666 auto FiniCBWrapper = [&](InsertPointTy IP) {
1667 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1668 // target to the region exit block.
1669 if (IP.getBlock()->end() == IP.getPoint()) {
1671 Builder.restoreIP(IP);
1672 Instruction *I = Builder.CreateBr(PRegExitBB);
1673 IP = InsertPointTy(I->getParent(), I->getIterator());
1674 }
1675 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1676 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1677 "Unexpected insertion point for finalization call!");
1678 return FiniCB(IP);
1679 };
1680
1681 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1682
1683 // Generate the privatization allocas in the block that will become the entry
1684 // of the outlined function.
1685 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1686 InsertPointTy InnerAllocaIP = Builder.saveIP();
1687
1688 AllocaInst *PrivTIDAddr =
1689 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1690 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1691
1692 // Add some fake uses for OpenMP provided arguments.
1693 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1694 Instruction *ZeroAddrUse =
1695 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1696 ToBeDeleted.push_back(ZeroAddrUse);
1697
1698 // EntryBB
1699 // |
1700 // V
1701 // PRegionEntryBB <- Privatization allocas are placed here.
1702 // |
1703 // V
  // PRegionBodyBB   <- BodyGen is invoked here.
1705 // |
1706 // V
1707 // PRegPreFiniBB <- The block we will start finalization from.
1708 // |
1709 // V
1710 // PRegionExitBB <- A common exit to simplify block collection.
1711 //
1712
1713 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1714
1715 // Let the caller create the body.
1716 assert(BodyGenCB && "Expected body generation callback!");
1717 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1718 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1719 return Err;
1720
1721 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1722
1723 OutlineInfo OI;
1724 if (Config.isTargetDevice()) {
1725 // Generate OpenMP target specific runtime call
1726 OI.PostOutlineCB = [=, ToBeDeletedVec =
1727 std::move(ToBeDeleted)](Function &OutlinedFn) {
1728 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1729 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1730 ThreadID, ToBeDeletedVec);
1731 };
1732 OI.FixUpNonEntryAllocas = true;
1733 } else {
1734 // Generate OpenMP host runtime call
1735 OI.PostOutlineCB = [=, ToBeDeletedVec =
1736 std::move(ToBeDeleted)](Function &OutlinedFn) {
1737 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1738 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1739 };
1740 OI.FixUpNonEntryAllocas = true;
1741 }
1742
1743 OI.OuterAllocaBB = OuterAllocaBlock;
1744 OI.EntryBB = PRegEntryBB;
1745 OI.ExitBB = PRegExitBB;
1746
1747 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1749 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1750
1751 CodeExtractorAnalysisCache CEAC(*OuterFn);
1752 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1753 /* AggregateArgs */ false,
1754 /* BlockFrequencyInfo */ nullptr,
1755 /* BranchProbabilityInfo */ nullptr,
1756 /* AssumptionCache */ nullptr,
1757 /* AllowVarArgs */ true,
1758 /* AllowAlloca */ true,
1759 /* AllocationBlock */ OuterAllocaBlock,
1760 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1761
1762 // Find inputs to, outputs from the code region.
1763 BasicBlock *CommonExit = nullptr;
1764 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1765 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1766
1767 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1768 /*CollectGlobalInputs=*/true);
1769
1770 Inputs.remove_if([&](Value *I) {
1772 return GV->getValueType() == OpenMPIRBuilder::Ident;
1773
1774 return false;
1775 });
1776
1777 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1778
1779 FunctionCallee TIDRTLFn =
1780 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1781
1782 auto PrivHelper = [&](Value &V) -> Error {
1783 if (&V == TIDAddr || &V == ZeroAddr) {
1785 return Error::success();
1786 }
1787
1789 for (Use &U : V.uses())
1790 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1791 if (ParallelRegionBlockSet.count(UserI->getParent()))
1792 Uses.insert(&U);
1793
1794 // __kmpc_fork_call expects extra arguments as pointers. If the input
1795 // already has a pointer type, everything is fine. Otherwise, store the
1796 // value onto stack and load it back inside the to-be-outlined region. This
1797 // will ensure only the pointer will be passed to the function.
1798 // FIXME: if there are more than 15 trailing arguments, they must be
1799 // additionally packed in a struct.
1800 Value *Inner = &V;
1801 if (!V.getType()->isPointerTy()) {
1803 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1804
1805 Builder.restoreIP(OuterAllocaIP);
1806 Value *Ptr =
1807 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1808
1809 // Store to stack at end of the block that currently branches to the entry
1810 // block of the to-be-outlined region.
1811 Builder.SetInsertPoint(InsertBB,
1812 InsertBB->getTerminator()->getIterator());
1813 Builder.CreateStore(&V, Ptr);
1814
1815 // Load back next to allocations in the to-be-outlined region.
1816 Builder.restoreIP(InnerAllocaIP);
1817 Inner = Builder.CreateLoad(V.getType(), Ptr);
1818 }
1819
1820 Value *ReplacementValue = nullptr;
1821 CallInst *CI = dyn_cast<CallInst>(&V);
1822 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1823 ReplacementValue = PrivTID;
1824 } else {
1825 InsertPointOrErrorTy AfterIP =
1826 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1827 if (!AfterIP)
1828 return AfterIP.takeError();
1829 Builder.restoreIP(*AfterIP);
1830 InnerAllocaIP = {
1831 InnerAllocaIP.getBlock(),
1832 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1833
1834 assert(ReplacementValue &&
1835 "Expected copy/create callback to set replacement value!");
1836 if (ReplacementValue == &V)
1837 return Error::success();
1838 }
1839
1840 for (Use *UPtr : Uses)
1841 UPtr->set(ReplacementValue);
1842
1843 return Error::success();
1844 };
1845
1846 // Reset the inner alloca insertion as it will be used for loading the values
1847 // wrapped into pointers before passing them into the to-be-outlined region.
1848 // Configure it to insert immediately after the fake use of zero address so
1849 // that they are available in the generated body and so that the
1850 // OpenMP-related values (thread ID and zero address pointers) remain leading
1851 // in the argument list.
1852 InnerAllocaIP = IRBuilder<>::InsertPoint(
1853 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1854
1855 // Reset the outer alloca insertion point to the entry of the relevant block
1856 // in case it was invalidated.
1857 OuterAllocaIP = IRBuilder<>::InsertPoint(
1858 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1859
1860 for (Value *Input : Inputs) {
1861 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1862 if (Error Err = PrivHelper(*Input))
1863 return Err;
1864 }
1865 LLVM_DEBUG({
1866 for (Value *Output : Outputs)
1867 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1868 });
1869 assert(Outputs.empty() &&
1870 "OpenMP outlining should not produce live-out values!");
1871
1872 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1873 LLVM_DEBUG({
1874 for (auto *BB : Blocks)
1875 dbgs() << " PBR: " << BB->getName() << "\n";
1876 });
1877
1878 // Adjust the finalization stack, verify the adjustment, and call the
1879 // finalize function a last time to finalize values between the pre-fini
1880 // block and the exit block if we left the parallel "the normal way".
1881 auto FiniInfo = FinalizationStack.pop_back_val();
1882 (void)FiniInfo;
1883 assert(FiniInfo.DK == OMPD_parallel &&
1884 "Unexpected finalization stack state!");
1885
1886 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1887
1888 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1889 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1890 if (!FiniBBOrErr)
1891 return FiniBBOrErr.takeError();
1892 {
1894 Builder.restoreIP(PreFiniIP);
1895 Builder.CreateBr(*FiniBBOrErr);
1896 // There's currently a branch to omp.par.exit. Delete it. We will get there
1897 // via the fini block
1898 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1899 Term->eraseFromParent();
1900 }
1901
1902 // Register the outlined info.
1903 addOutlineInfo(std::move(OI));
1904
1905 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1906 UI->eraseFromParent();
1907
1908 return AfterIP;
1909}
1910
1912 // Build call void __kmpc_flush(ident_t *loc)
1913 uint32_t SrcLocStrSize;
1914 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1915 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1916
1918 Args);
1919}
1920
1922 if (!updateToLocation(Loc))
1923 return;
1924 emitFlush(Loc);
1925}
1926
1928 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1929 // global_tid);
1930 uint32_t SrcLocStrSize;
1931 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1932 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1933 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1934
1935 // Ignore return result until untied tasks are supported.
1937 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1938}
1939
1945
1947 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1948 uint32_t SrcLocStrSize;
1949 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1950 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1951 Constant *I32Null = ConstantInt::getNullValue(Int32);
1952 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1953
1955 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1956}
1957
1963
1964// Processes the dependencies in Dependencies and does the following
1965// - Allocates space on the stack of an array of DependInfo objects
1966// - Populates each DependInfo object with relevant information of
1967// the corresponding dependence.
1968// - All code is inserted in the entry block of the current function.
1970 OpenMPIRBuilder &OMPBuilder,
1972 // Early return if we have no dependencies to process
1973 if (Dependencies.empty())
1974 return nullptr;
1975
1976 // Given a vector of DependData objects, in this function we create an
1977 // array on the stack that holds kmp_dep_info objects corresponding
1978 // to each dependency. This is then passed to the OpenMP runtime.
1979 // For example, if there are 'n' dependencies then the following psedo
1980 // code is generated. Assume the first dependence is on a variable 'a'
1981 //
1982 // \code{c}
1983 // DepArray = alloc(n x sizeof(kmp_depend_info);
1984 // idx = 0;
1985 // DepArray[idx].base_addr = ptrtoint(&a);
1986 // DepArray[idx].len = 8;
1987 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1988 // ++idx;
1989 // DepArray[idx].base_addr = ...;
1990 // \endcode
1991
1992 IRBuilderBase &Builder = OMPBuilder.Builder;
1993 Type *DependInfo = OMPBuilder.DependInfo;
1994 Module &M = OMPBuilder.M;
1995
1996 Value *DepArray = nullptr;
1997 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1998 Builder.SetInsertPoint(
2000
2001 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2002 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2003
2004 Builder.restoreIP(OldIP);
2005
2006 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2007 Value *Base =
2008 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2009 // Store the pointer to the variable
2010 Value *Addr = Builder.CreateStructGEP(
2011 DependInfo, Base,
2012 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2013 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2014 Builder.CreateStore(DepValPtr, Addr);
2015 // Store the size of the variable
2016 Value *Size = Builder.CreateStructGEP(
2017 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
2018 Builder.CreateStore(
2019 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2020 Size);
2021 // Store the dependency kind
2022 Value *Flags = Builder.CreateStructGEP(
2023 DependInfo, Base,
2024 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2025 Builder.CreateStore(
2026 ConstantInt::get(Builder.getInt8Ty(),
2027 static_cast<unsigned int>(Dep.DepKind)),
2028 Flags);
2029 }
2030 return DepArray;
2031}
2032
2033/// Create the task duplication function passed to kmpc_taskloop.
2034Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2035 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2036 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2037 if (!DupCB)
2039 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2040
2041 // From OpenMP Runtime p_task_dup_t:
2042 // Routine optionally generated by the compiler for setting the lastprivate
2043 // flag and calling needed constructors for private/firstprivate objects (used
2044 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2045 // lastprivate flag.
2046 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2047
2048 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2049
2050 FunctionType *DupFuncTy = FunctionType::get(
2051 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2052 /*isVarArg=*/false);
2053
2054 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2055 "omp_taskloop_dup", M);
2056 Value *DestTaskArg = DupFunction->getArg(0);
2057 Value *SrcTaskArg = DupFunction->getArg(1);
2058 Value *LastprivateFlagArg = DupFunction->getArg(2);
2059 DestTaskArg->setName("dest_task");
2060 SrcTaskArg->setName("src_task");
2061 LastprivateFlagArg->setName("lastprivate_flag");
2062
2063 IRBuilderBase::InsertPointGuard Guard(Builder);
2064 Builder.SetInsertPoint(
2065 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2066
2067 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2068 Type *TaskWithPrivatesTy =
2069 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2070 Value *TaskPrivates = Builder.CreateGEP(
2071 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2072 Value *ContextPtr = Builder.CreateGEP(
2073 PrivatesTy, TaskPrivates,
2074 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2075 return ContextPtr;
2076 };
2077
2078 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2079 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2080
2081 DestTaskContextPtr->setName("destPtr");
2082 SrcTaskContextPtr->setName("srcPtr");
2083
2084 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2085 DupFunction->getEntryBlock().begin());
2086 InsertPointTy CodeGenIP = Builder.saveIP();
2087 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2088 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2089 if (!AfterIPOrError)
2090 return AfterIPOrError.takeError();
2091 Builder.restoreIP(*AfterIPOrError);
2092
2093 Builder.CreateRetVoid();
2094
2095 return DupFunction;
2096}
2097
2098OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2099 const LocationDescription &Loc, InsertPointTy AllocaIP,
2100 BodyGenCallbackTy BodyGenCB,
2101 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2102 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2103 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2104 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2105 Value *TaskContextStructPtrVal) {
2106
2107 if (!updateToLocation(Loc))
2108 return InsertPointTy();
2109
2110 uint32_t SrcLocStrSize;
2111 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2112 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2113
2114 BasicBlock *TaskloopExitBB =
2115 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2116 BasicBlock *TaskloopBodyBB =
2117 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2118 BasicBlock *TaskloopAllocaBB =
2119 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2120
2121 InsertPointTy TaskloopAllocaIP =
2122 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2123 InsertPointTy TaskloopBodyIP =
2124 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2125
2126 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2127 return Err;
2128
2129 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2130 if (!result) {
2131 return result.takeError();
2132 }
2133
2134 llvm::CanonicalLoopInfo *CLI = result.get();
2135 OutlineInfo OI;
2136 OI.EntryBB = TaskloopAllocaBB;
2137 OI.OuterAllocaBB = AllocaIP.getBlock();
2138 OI.ExitBB = TaskloopExitBB;
2139
2140 // Add the thread ID argument.
2141 SmallVector<Instruction *> ToBeDeleted;
2142 // dummy instruction to be used as a fake argument
2143 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2144 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2145 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2146 TaskloopAllocaIP, "lb", false, true);
2147 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2148 TaskloopAllocaIP, "ub", false, true);
2149 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2150 TaskloopAllocaIP, "step", false, true);
2151 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2152 // aggregate struct
2153 OI.Inputs.insert(FakeLB);
2154 OI.Inputs.insert(FakeUB);
2155 OI.Inputs.insert(FakeStep);
2156 if (TaskContextStructPtrVal)
2157 OI.Inputs.insert(TaskContextStructPtrVal);
2158 assert(((TaskContextStructPtrVal && DupCB) ||
2159 (!TaskContextStructPtrVal && !DupCB)) &&
2160 "Task context struct ptr and duplication callback must be both set "
2161 "or both null");
2162
2163 // It isn't safe to run the duplication bodygen callback inside the post
2164 // outlining callback so this has to be run now before we know the real task
2165 // shareds structure type.
2166 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2167 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2168 Type *FakeSharedsTy = StructType::get(
2169 Builder.getContext(),
2170 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2171 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2172 FakeSharedsTy,
2173 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2174 if (!TaskDupFnOrErr) {
2175 return TaskDupFnOrErr.takeError();
2176 }
2177 Value *TaskDupFn = *TaskDupFnOrErr;
2178
2179 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2180 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2181 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2182 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2183 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2184 // Replace the Stale CI by appropriate RTL function call.
2185 assert(OutlinedFn.hasOneUse() &&
2186 "there must be a single user for the outlined function");
2187 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2188
2189 /* Create the casting for the Bounds Values that can be used when outlining
2190 * to replace the uses of the fakes with real values */
2191 BasicBlock *CodeReplBB = StaleCI->getParent();
2192 IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
2193 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2194 Value *CastedLBVal =
2195 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2196 Value *CastedUBVal =
2197 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2198 Value *CastedStepVal =
2199 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2200 Builder.restoreIP(CurrentIp);
2201
2202 Builder.SetInsertPoint(StaleCI);
2203
2204 // Gather the arguments for emitting the runtime call for
2205 // @__kmpc_omp_task_alloc
2206 Function *TaskAllocFn =
2207 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2208
2209 Value *ThreadID = getOrCreateThreadID(Ident);
2210
2211 if (!NoGroup) {
2212 // Emit runtime call for @__kmpc_taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216 }
2217
2218 // `flags` Argument Configuration
2219 // Task is tied if (Flags & 1) == 1.
2220 // Task is untied if (Flags & 1) == 0.
2221 // Task is final if (Flags & 2) == 2.
2222 // Task is not final if (Flags & 2) == 0.
2223 // Task is mergeable if (Flags & 4) == 4.
2224 // Task is not mergeable if (Flags & 4) == 0.
2225 // Task is priority if (Flags & 32) == 32.
2226 // Task is not priority if (Flags & 32) == 0.
2227 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2228 if (Final)
2229 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2230 if (Mergeable)
2231 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2232 if (Priority)
2233 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2234
2235 Value *TaskSize = Builder.getInt64(
2236 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2237
2238 AllocaInst *ArgStructAlloca =
2240 assert(ArgStructAlloca &&
2241 "Unable to find the alloca instruction corresponding to arguments "
2242 "for extracted function");
2243 std::optional<TypeSize> ArgAllocSize =
2244 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2245 assert(ArgAllocSize &&
2246 "Unable to determine size of arguments for extracted function");
2247 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2248
2249 // Emit the @__kmpc_omp_task_alloc runtime call
2250 // The runtime call returns a pointer to an area where the task captured
2251 // variables must be copied before the task is run (TaskData)
2252 CallInst *TaskData = Builder.CreateCall(
2253 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2254 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2255 /*task_func=*/&OutlinedFn});
2256
2257 Value *Shareds = StaleCI->getArgOperand(1);
2258 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2259 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2260 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2261 SharedsSize);
2262 // Get the pointer to loop lb, ub, step from task ptr
2263 // and set up the lowerbound,upperbound and step values
2264 llvm::Value *Lb = Builder.CreateGEP(
2265 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2266
2267 llvm::Value *Ub = Builder.CreateGEP(
2268 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2269
2270 llvm::Value *Step = Builder.CreateGEP(
2271 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2272 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2273
2274 // set up the arguments for emitting kmpc_taskloop runtime call
2275 // setting values for ifval, nogroup, sched, grainsize, task_dup
2276 Value *IfCondVal =
2277 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2278 : Builder.getInt32(1);
2279 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2280 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2281 Value *NoGroupVal = Builder.getInt32(1);
2282 Value *SchedVal = Builder.getInt32(Sched);
2283 Value *GrainSizeVal =
2284 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2285 : Builder.getInt64(0);
2286 Value *TaskDup = TaskDupFn;
2287
2288 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2289 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2290
2291 // taskloop runtime call
2292 Function *TaskloopFn =
2293 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2294 Builder.CreateCall(TaskloopFn, Args);
2295
2296 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2297 // nogroup is not defined
2298 if (!NoGroup) {
2299 Function *EndTaskgroupFn =
2300 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2301 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2302 }
2303
2304 StaleCI->eraseFromParent();
2305
2306 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2307
2308 LoadInst *SharedsOutlined =
2309 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2310 OutlinedFn.getArg(1)->replaceUsesWithIf(
2311 SharedsOutlined,
2312 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2313
2314 Value *IV = CLI->getIndVar();
2315 Type *IVTy = IV->getType();
2316 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2317
2318 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2319 // UpperBound. These GEP's can be reused for loading the tasks respective
2320 // bounds.
2321 Value *TaskLB = nullptr;
2322 Value *TaskUB = nullptr;
2323 Value *LoadTaskLB = nullptr;
2324 Value *LoadTaskUB = nullptr;
2325 for (Instruction &I : *TaskloopAllocaBB) {
2326 if (I.getOpcode() == Instruction::GetElementPtr) {
2327 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2328 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2329 switch (CI->getZExtValue()) {
2330 case 0:
2331 TaskLB = &I;
2332 break;
2333 case 1:
2334 TaskUB = &I;
2335 break;
2336 }
2337 }
2338 } else if (I.getOpcode() == Instruction::Load) {
2339 LoadInst &Load = cast<LoadInst>(I);
2340 if (Load.getPointerOperand() == TaskLB) {
2341 assert(TaskLB != nullptr && "Expected value for TaskLB");
2342 LoadTaskLB = &I;
2343 } else if (Load.getPointerOperand() == TaskUB) {
2344 assert(TaskUB != nullptr && "Expected value for TaskUB");
2345 LoadTaskUB = &I;
2346 }
2347 }
2348 }
2349
2350 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2351
2352 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2353 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2354 Value *TripCountMinusOne =
2355 Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
2356 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2357 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2358 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2359 // set the trip count in the CLI
2360 CLI->setTripCount(CastedTripCount);
2361
2362 Builder.SetInsertPoint(CLI->getBody(),
2363 CLI->getBody()->getFirstInsertionPt());
2364
2365 if (NumOfCollapseLoops > 1) {
2366 llvm::SmallVector<User *> UsersToReplace;
2367 // When using the collapse clause, the bounds of the loop have to be
2368 // adjusted to properly represent the iterator of the outer loop.
2369 Value *IVPlusTaskLB = Builder.CreateAdd(
2370 CLI->getIndVar(),
2371 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2372 // To ensure every Use is correctly captured, we first want to record
2373 // which users to replace the value in, and then replace the value.
2374 for (auto IVUse = CLI->getIndVar()->uses().begin();
2375 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2376 User *IVUser = IVUse->getUser();
2377 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2378 if (Op->getOpcode() == Instruction::URem ||
2379 Op->getOpcode() == Instruction::UDiv) {
2380 UsersToReplace.push_back(IVUser);
2381 }
2382 }
2383 }
2384 for (User *User : UsersToReplace) {
2385 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2386 }
2387 } else {
2388 // The canonical loop is generated with a fixed lower bound. We need to
2389 // update the index calculation code to use the task's lower bound. The
2390 // generated code looks like this:
2391 // %omp_loop.iv = phi ...
2392 // ...
2393 // %tmp = mul [type] %omp_loop.iv, step
2394 // %user_index = add [type] tmp, lb
2395 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2396 // of the normalised induction variable:
2397 // 1. This one: converting the normalised IV to the user IV
2398 // 2. The increment (add)
2399 // 3. The comparison against the trip count (icmp)
2400 // (1) is the only use that is a mul followed by an add so this cannot
2401 // match other IR.
2402 assert(CLI->getIndVar()->getNumUses() == 3 &&
2403 "Canonical loop should have exactly three uses of the ind var");
2404 for (User *IVUser : CLI->getIndVar()->users()) {
2405 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2406 if (Mul->getOpcode() == Instruction::Mul) {
2407 for (User *MulUser : Mul->users()) {
2408 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2409 if (Add->getOpcode() == Instruction::Add) {
2410 Add->setOperand(1, CastedTaskLB);
2411 }
2412 }
2413 }
2414 }
2415 }
2416 }
2417 }
2418
2419 FakeLB->replaceAllUsesWith(CastedLBVal);
2420 FakeUB->replaceAllUsesWith(CastedUBVal);
2421 FakeStep->replaceAllUsesWith(CastedStepVal);
2422 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2423 I->eraseFromParent();
2424 }
2425 };
2426
2427 addOutlineInfo(std::move(OI));
2428 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2429 return Builder.saveIP();
2430}
2431
  // NOTE(review): the opening lines of this definition (its name/signature and
  // the start of the IntegerType::get(...) call binding IntPtrTy) appear to be
  // missing from this rendering — confirm against the full source. The visible
  // code builds a struct of two pointer-sized integers followed by an i32.
  M.getContext(), M.getDataLayout().getPointerSizeInBits());
  return llvm::StructType::get(IntPtrTy, IntPtrTy,
                               llvm::Type::getInt32Ty(M.getContext()));
}
2438
2440 const LocationDescription &Loc, InsertPointTy AllocaIP,
2441 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2442 SmallVector<DependData> Dependencies, AffinityData Affinities,
2443 bool Mergeable, Value *EventHandle, Value *Priority) {
2444
2445 if (!updateToLocation(Loc))
2446 return InsertPointTy();
2447
2448 uint32_t SrcLocStrSize;
2449 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2450 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2451 // The current basic block is split into four basic blocks. After outlining,
2452 // they will be mapped as follows:
2453 // ```
2454 // def current_fn() {
2455 // current_basic_block:
2456 // br label %task.exit
2457 // task.exit:
2458 // ; instructions after task
2459 // }
2460 // def outlined_fn() {
2461 // task.alloca:
2462 // br label %task.body
2463 // task.body:
2464 // ret void
2465 // }
2466 // ```
2467 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2468 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2469 BasicBlock *TaskAllocaBB =
2470 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2471
2472 InsertPointTy TaskAllocaIP =
2473 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2474 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2475 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2476 return Err;
2477
2478 OutlineInfo OI;
2479 OI.EntryBB = TaskAllocaBB;
2480 OI.OuterAllocaBB = AllocaIP.getBlock();
2481 OI.ExitBB = TaskExitBB;
2482
2483 // Add the thread ID argument.
2486 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2487
2488 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2489 Affinities, Mergeable, Priority, EventHandle,
2490 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
2491 // Replace the Stale CI by appropriate RTL function call.
2492 assert(OutlinedFn.hasOneUse() &&
2493 "there must be a single user for the outlined function");
2494 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2495
2496 // HasShareds is true if any variables are captured in the outlined region,
2497 // false otherwise.
2498 bool HasShareds = StaleCI->arg_size() > 1;
2499 Builder.SetInsertPoint(StaleCI);
2500
2501 // Gather the arguments for emitting the runtime call for
2502 // @__kmpc_omp_task_alloc
2503 Function *TaskAllocFn =
2504 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2505
2506 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2507 // call.
2508 Value *ThreadID = getOrCreateThreadID(Ident);
2509
2510 // Argument - `flags`
2511 // Task is tied iff (Flags & 1) == 1.
2512 // Task is untied iff (Flags & 1) == 0.
2513 // Task is final iff (Flags & 2) == 2.
2514 // Task is not final iff (Flags & 2) == 0.
2515 // Task is mergeable iff (Flags & 4) == 4.
2516 // Task is not mergeable iff (Flags & 4) == 0.
2517 // Task is priority iff (Flags & 32) == 32.
2518 // Task is not priority iff (Flags & 32) == 0.
2519 // TODO: Handle the other flags.
2520 Value *Flags = Builder.getInt32(Tied);
2521 if (Final) {
2522 Value *FinalFlag =
2523 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2524 Flags = Builder.CreateOr(FinalFlag, Flags);
2525 }
2526
2527 if (Mergeable)
2528 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2529 if (Priority)
2530 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2531
2532 // Argument - `sizeof_kmp_task_t` (TaskSize)
2533 // Tasksize refers to the size in bytes of kmp_task_t data structure
2534 // including private vars accessed in task.
2535 // TODO: add kmp_task_t_with_privates (privates)
2536 Value *TaskSize = Builder.getInt64(
2537 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2538
2539 // Argument - `sizeof_shareds` (SharedsSize)
2540 // SharedsSize refers to the shareds array size in the kmp_task_t data
2541 // structure.
2542 Value *SharedsSize = Builder.getInt64(0);
2543 if (HasShareds) {
2544 AllocaInst *ArgStructAlloca =
2546 assert(ArgStructAlloca &&
2547 "Unable to find the alloca instruction corresponding to arguments "
2548 "for extracted function");
2549 std::optional<TypeSize> ArgAllocSize =
2550 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2551 assert(ArgAllocSize &&
2552 "Unable to determine size of arguments for extracted function");
2553 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2554 }
2555 // Emit the @__kmpc_omp_task_alloc runtime call
2556 // The runtime call returns a pointer to an area where the task captured
2557 // variables must be copied before the task is run (TaskData)
2559 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2560 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2561 /*task_func=*/&OutlinedFn});
2562
2563 if (Affinities.Count && Affinities.Info) {
2565 OMPRTL___kmpc_omp_reg_task_with_affinity);
2566
2567 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2568 Affinities.Count, Affinities.Info});
2569 }
2570
2571 // Emit detach clause initialization.
2572 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2573 // task_descriptor);
2574 if (EventHandle) {
2576 OMPRTL___kmpc_task_allow_completion_event);
2577 llvm::Value *EventVal =
2578 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2579 llvm::Value *EventHandleAddr =
2580 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2581 Builder.getPtrTy(0));
2582 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2583 Builder.CreateStore(EventVal, EventHandleAddr);
2584 }
2585 // Copy the arguments for outlined function
2586 if (HasShareds) {
2587 Value *Shareds = StaleCI->getArgOperand(1);
2588 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2589 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2590 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2591 SharedsSize);
2592 }
2593
2594 if (Priority) {
2595 //
2596 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2597 // we populate the priority information into the "kmp_task_t" here
2598 //
2599 // The struct "kmp_task_t" definition is available in kmp.h
2600 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2601 // data2 is used for priority
2602 //
2603 Type *Int32Ty = Builder.getInt32Ty();
2604 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2605 // kmp_task_t* => { ptr }
2606 Type *TaskPtr = StructType::get(VoidPtr);
2607 Value *TaskGEP =
2608 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2609 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2610 Type *TaskStructType = StructType::get(
2611 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2612 Value *PriorityData = Builder.CreateInBoundsGEP(
2613 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2614 // kmp_cmplrdata_t => { ptr, ptr }
2615 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2616 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2617 PriorityData, {Zero, Zero});
2618 Builder.CreateStore(Priority, CmplrData);
2619 }
2620
2621 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2622
2623 // In the presence of the `if` clause, the following IR is generated:
2624 // ...
2625 // %data = call @__kmpc_omp_task_alloc(...)
2626 // br i1 %if_condition, label %then, label %else
2627 // then:
2628 // call @__kmpc_omp_task(...)
2629 // br label %exit
2630 // else:
2631 // ;; Wait for resolution of dependencies, if any, before
2632 // ;; beginning the task
2633 // call @__kmpc_omp_wait_deps(...)
2634 // call @__kmpc_omp_task_begin_if0(...)
2635 // call @outlined_fn(...)
2636 // call @__kmpc_omp_task_complete_if0(...)
2637 // br label %exit
2638 // exit:
2639 // ...
2640 if (IfCondition) {
2641 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2642 // terminator.
2643 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2644 Instruction *IfTerminator =
2645 Builder.GetInsertPoint()->getParent()->getTerminator();
2646 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2647 Builder.SetInsertPoint(IfTerminator);
2648 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2649 &ElseTI);
2650 Builder.SetInsertPoint(ElseTI);
2651
2652 if (Dependencies.size()) {
2653 Function *TaskWaitFn =
2654 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2656 TaskWaitFn,
2657 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2658 ConstantInt::get(Builder.getInt32Ty(), 0),
2660 }
2661 Function *TaskBeginFn =
2662 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2663 Function *TaskCompleteFn =
2664 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2665 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2666 CallInst *CI = nullptr;
2667 if (HasShareds)
2668 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2669 else
2670 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2671 CI->setDebugLoc(StaleCI->getDebugLoc());
2672 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2673 Builder.SetInsertPoint(ThenTI);
2674 }
2675
2676 if (Dependencies.size()) {
2677 Function *TaskFn =
2678 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2680 TaskFn,
2681 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2682 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2684
2685 } else {
2686 // Emit the @__kmpc_omp_task runtime call to spawn the task
2687 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2688 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2689 }
2690
2691 StaleCI->eraseFromParent();
2692
2693 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2694 if (HasShareds) {
2695 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2696 OutlinedFn.getArg(1)->replaceUsesWithIf(
2697 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2698 }
2699
2700 for (Instruction *I : llvm::reverse(ToBeDeleted))
2701 I->eraseFromParent();
2702 };
2703
2704 addOutlineInfo(std::move(OI));
2705 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2706
2707 return Builder.saveIP();
2708}
2709
2712 InsertPointTy AllocaIP,
2713 BodyGenCallbackTy BodyGenCB) {
2714 if (!updateToLocation(Loc))
2715 return InsertPointTy();
2716
2717 uint32_t SrcLocStrSize;
2718 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2719 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2720 Value *ThreadID = getOrCreateThreadID(Ident);
2721
2722 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2723 Function *TaskgroupFn =
2724 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2725 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2726
2727 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2728 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2729 return Err;
2730
2731 Builder.SetInsertPoint(TaskgroupExitBB);
2732 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2733 Function *EndTaskgroupFn =
2734 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2735 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2736
2737 return Builder.saveIP();
2738}
2739
2741 const LocationDescription &Loc, InsertPointTy AllocaIP,
2743 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2744 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2745
2746 if (!updateToLocation(Loc))
2747 return Loc.IP;
2748
2749 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2750
2751 // Each section is emitted as a switch case
2752 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2753 // -> OMP.createSection() which generates the IR for each section
2754 // Iterate through all sections and emit a switch construct:
2755 // switch (IV) {
2756 // case 0:
2757 // <SectionStmt[0]>;
2758 // break;
2759 // ...
2760 // case <NumSection> - 1:
2761 // <SectionStmt[<NumSection> - 1]>;
2762 // break;
2763 // }
2764 // ...
2765 // section_loop.after:
2766 // <FiniCB>;
2767 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2768 Builder.restoreIP(CodeGenIP);
2770 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2771 Function *CurFn = Continue->getParent();
2772 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2773
2774 unsigned CaseNumber = 0;
2775 for (auto SectionCB : SectionCBs) {
2777 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2778 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2779 Builder.SetInsertPoint(CaseBB);
2780 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
2781 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2782 CaseEndBr->getIterator()}))
2783 return Err;
2784 CaseNumber++;
2785 }
2786 // remove the existing terminator from body BB since there can be no
2787 // terminators after switch/case
2788 return Error::success();
2789 };
2790 // Loop body ends here
2791 // LowerBound, UpperBound, and STride for createCanonicalLoop
2792 Type *I32Ty = Type::getInt32Ty(M.getContext());
2793 Value *LB = ConstantInt::get(I32Ty, 0);
2794 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2795 Value *ST = ConstantInt::get(I32Ty, 1);
2797 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2798 if (!LoopInfo)
2799 return LoopInfo.takeError();
2800
2801 InsertPointOrErrorTy WsloopIP =
2802 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2803 WorksharingLoopType::ForStaticLoop, !IsNowait);
2804 if (!WsloopIP)
2805 return WsloopIP.takeError();
2806 InsertPointTy AfterIP = *WsloopIP;
2807
2808 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2809 assert(LoopFini && "Bad structure of static workshare loop finalization");
2810
2811 // Apply the finalization callback in LoopAfterBB
2812 auto FiniInfo = FinalizationStack.pop_back_val();
2813 assert(FiniInfo.DK == OMPD_sections &&
2814 "Unexpected finalization stack state!");
2815 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2816 return Err;
2817
2818 return AfterIP;
2819}
2820
// NOTE(review): this block is OpenMPIRBuilder::createSection(...). Its
// leading signature lines (return type, function name, `Loc` parameter)
// were lost when this listing was extracted; only the trailing parameters
// remain visible below.
2823 BodyGenCallbackTy BodyGenCB,
2824 FinalizeCallbackTy FiniCB) {
// Bail out, returning the unchanged insert point, when the location is
// invalid (nothing to emit).
2825 if (!updateToLocation(Loc))
2826 return Loc.IP;
2827
// Wrap the user finalization callback: when finalization is requested at the
// very end of a block (a cancellation block with its terminator already
// stripped), first re-route control to the section's exit block so nested
// region finalization still sees a terminated block.
2828 auto FiniCBWrapper = [&](InsertPointTy IP) {
2829 if (IP.getBlock()->end() != IP.getPoint())
2830 return FiniCB(IP);
2831 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2832 // will fail because that function requires the Finalization Basic Block to
2833 // have a terminator, which is already removed by EmitOMPRegionBody.
2834 // IP is currently at cancelation block.
2835 // We need to backtrack to the condition block to fetch
2836 // the exit block and create a branch from cancelation
2837 // to exit block.
// NOTE(review): original line 2838 appears to have been dropped during
// extraction (likely an IRBuilder insert-point guard declaration) --
// confirm against the upstream source.
2839 Builder.restoreIP(IP);
// Walk two single-predecessor links up from the case block to the condition
// block; successor #1 of its terminator is taken as the section exit.
2840 auto *CaseBB = Loc.IP.getBlock();
2841 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2842 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
// Branch the cancellation path to the exit block, then run the user
// finalization callback at the new branch's position.
2843 Instruction *I = Builder.CreateBr(ExitBB);
2844 IP = InsertPointTy(I->getParent(), I->getIterator());
2845 return FiniCB(IP);
2846 };
2847
2848 Directive OMPD = Directive::OMPD_sections;
2849 // Since we are using Finalization Callback here, HasFinalize
2850 // and IsCancellable have to be true
2851 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2852 /*Conditional*/ false, /*hasFinalize*/ true,
2853 /*IsCancellable*/ true);
2854}
2855
2861
2862Value *OpenMPIRBuilder::getGPUThreadID() {
2865 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2866 {});
2867}
2868
2869Value *OpenMPIRBuilder::getGPUWarpSize() {
2871 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2872}
2873
2874Value *OpenMPIRBuilder::getNVPTXWarpID() {
2875 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2876 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2877}
2878
2879Value *OpenMPIRBuilder::getNVPTXLaneID() {
2880 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2881 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2882 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2883 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2884 "nvptx_lane_id");
2885}
2886
2887Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2888 Type *ToType) {
2889 Type *FromType = From->getType();
2890 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2891 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2892 assert(FromSize > 0 && "From size must be greater than zero");
2893 assert(ToSize > 0 && "To size must be greater than zero");
2894 if (FromType == ToType)
2895 return From;
2896 if (FromSize == ToSize)
2897 return Builder.CreateBitCast(From, ToType);
2898 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2899 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2900 InsertPointTy SaveIP = Builder.saveIP();
2901 Builder.restoreIP(AllocaIP);
2902 Value *CastItem = Builder.CreateAlloca(ToType);
2903 Builder.restoreIP(SaveIP);
2904
2905 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2906 CastItem, Builder.getPtrTy(0));
2907 Builder.CreateStore(From, ValCastItem);
2908 return Builder.CreateLoad(ToType, CastItem);
2909}
2910
2911Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2912 Value *Element,
2913 Type *ElementType,
2914 Value *Offset) {
2915 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2916 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2917
2918 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2919 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2920 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2921 Value *WarpSize =
2922 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2924 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2925 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2926 Value *WarpSizeCast =
2927 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2928 Value *ShuffleCall =
2929 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2930 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2931}
2932
2933void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2934 Value *DstAddr, Type *ElemType,
2935 Value *Offset, Type *ReductionArrayTy,
2936 bool IsByRefElem) {
2937 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2938 // Create the loop over the big sized data.
2939 // ptr = (void*)Elem;
2940 // ptrEnd = (void*) Elem + 1;
2941 // Step = 8;
2942 // while (ptr + Step < ptrEnd)
2943 // shuffle((int64_t)*ptr);
2944 // Step = 4;
2945 // while (ptr + Step < ptrEnd)
2946 // shuffle((int32_t)*ptr);
2947 // ...
2948 Type *IndexTy = Builder.getIndexTy(
2949 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2950 Value *ElemPtr = DstAddr;
2951 Value *Ptr = SrcAddr;
2952 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2953 if (Size < IntSize)
2954 continue;
2955 Type *IntType = Builder.getIntNTy(IntSize * 8);
2956 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2957 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2958 Value *SrcAddrGEP =
2959 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2960 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2961 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2962
2963 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2964 if ((Size / IntSize) > 1) {
2965 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2966 SrcAddrGEP, Builder.getPtrTy());
2967 BasicBlock *PreCondBB =
2968 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2969 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2970 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2971 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2972 emitBlock(PreCondBB, CurFunc);
2973 PHINode *PhiSrc =
2974 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2975 PhiSrc->addIncoming(Ptr, CurrentBB);
2976 PHINode *PhiDest =
2977 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2978 PhiDest->addIncoming(ElemPtr, CurrentBB);
2979 Ptr = PhiSrc;
2980 ElemPtr = PhiDest;
2981 Value *PtrDiff = Builder.CreatePtrDiff(
2982 Builder.getInt8Ty(), PtrEnd,
2983 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2984 Builder.CreateCondBr(
2985 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2986 ExitBB);
2987 emitBlock(ThenBB, CurFunc);
2988 Value *Res = createRuntimeShuffleFunction(
2989 AllocaIP,
2990 Builder.CreateAlignedLoad(
2991 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2992 IntType, Offset);
2993 Builder.CreateAlignedStore(Res, ElemPtr,
2994 M.getDataLayout().getPrefTypeAlign(ElemType));
2995 Value *LocalPtr =
2996 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2997 Value *LocalElemPtr =
2998 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2999 PhiSrc->addIncoming(LocalPtr, ThenBB);
3000 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3001 emitBranch(PreCondBB);
3002 emitBlock(ExitBB, CurFunc);
3003 } else {
3004 Value *Res = createRuntimeShuffleFunction(
3005 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3006 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3007 Res->getType()->getScalarSizeInBits())
3008 Res = Builder.CreateTrunc(Res, ElemType);
3009 Builder.CreateStore(Res, ElemPtr);
3010 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3011 ElemPtr =
3012 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3013 }
3014 Size = Size % IntSize;
3015 }
3016}
3017
3018Error OpenMPIRBuilder::emitReductionListCopy(
3019 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3020 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3021 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3022 Type *IndexTy = Builder.getIndexTy(
3023 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3024 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3025
3026 // Iterates, element-by-element, through the source Reduce list and
3027 // make a copy.
3028 for (auto En : enumerate(ReductionInfos)) {
3029 const ReductionInfo &RI = En.value();
3030 Value *SrcElementAddr = nullptr;
3031 AllocaInst *DestAlloca = nullptr;
3032 Value *DestElementAddr = nullptr;
3033 Value *DestElementPtrAddr = nullptr;
3034 // Should we shuffle in an element from a remote lane?
3035 bool ShuffleInElement = false;
3036 // Set to true to update the pointer in the dest Reduce list to a
3037 // newly created element.
3038 bool UpdateDestListPtr = false;
3039
3040 // Step 1.1: Get the address for the src element in the Reduce list.
3041 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3042 ReductionArrayTy, SrcBase,
3043 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3044 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3045
3046 // Step 1.2: Create a temporary to store the element in the destination
3047 // Reduce list.
3048 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3049 ReductionArrayTy, DestBase,
3050 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3051 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3052 switch (Action) {
3054 InsertPointTy CurIP = Builder.saveIP();
3055 Builder.restoreIP(AllocaIP);
3056
3057 Type *DestAllocaType =
3058 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3059 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3060 ".omp.reduction.element");
3061 DestAlloca->setAlignment(
3062 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3063 DestElementAddr = DestAlloca;
3064 DestElementAddr =
3065 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3066 DestElementAddr->getName() + ".ascast");
3067 Builder.restoreIP(CurIP);
3068 ShuffleInElement = true;
3069 UpdateDestListPtr = true;
3070 break;
3071 }
3073 DestElementAddr =
3074 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3075 break;
3076 }
3077 }
3078
3079 // Now that all active lanes have read the element in the
3080 // Reduce list, shuffle over the value from the remote lane.
3081 if (ShuffleInElement) {
3082 Type *ShuffleType = RI.ElementType;
3083 Value *ShuffleSrcAddr = SrcElementAddr;
3084 Value *ShuffleDestAddr = DestElementAddr;
3085 AllocaInst *LocalStorage = nullptr;
3086
3087 if (IsByRefElem) {
3088 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3089 assert(RI.ByRefAllocatedType &&
3090 "Expected by-ref allocated type to be set");
3091 // For by-ref reductions, we need to copy from the remote lane the
3092 // actual value of the partial reduction computed by that remote lane;
3093 // rather than, for example, a pointer to that data or, even worse, a
3094 // pointer to the descriptor of the by-ref reduction element.
3095 ShuffleType = RI.ByRefElementType;
3096
3097 InsertPointOrErrorTy GenResult =
3098 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3099
3100 if (!GenResult)
3101 return GenResult.takeError();
3102
3103 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3104
3105 {
3106 InsertPointTy OldIP = Builder.saveIP();
3107 Builder.restoreIP(AllocaIP);
3108
3109 LocalStorage = Builder.CreateAlloca(ShuffleType);
3110 Builder.restoreIP(OldIP);
3111 ShuffleDestAddr = LocalStorage;
3112 }
3113 }
3114
3115 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3116 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3117
3118 if (IsByRefElem) {
3119 // Copy descriptor from source and update base_ptr to shuffled data
3120 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3121 DestAlloca, Builder.getPtrTy(), ".ascast");
3122
3123 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3124 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3125 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3126
3127 if (!GenResult)
3128 return GenResult.takeError();
3129 }
3130 } else {
3131 switch (RI.EvaluationKind) {
3132 case EvalKind::Scalar: {
3133 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3134 // Store the source element value to the dest element address.
3135 Builder.CreateStore(Elem, DestElementAddr);
3136 break;
3137 }
3138 case EvalKind::Complex: {
3139 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3140 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3141 Value *SrcReal = Builder.CreateLoad(
3142 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3143 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3144 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3145 Value *SrcImg = Builder.CreateLoad(
3146 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3147
3148 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3149 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3150 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3151 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3152 Builder.CreateStore(SrcReal, DestRealPtr);
3153 Builder.CreateStore(SrcImg, DestImgPtr);
3154 break;
3155 }
3156 case EvalKind::Aggregate: {
3157 Value *SizeVal = Builder.getInt64(
3158 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3159 Builder.CreateMemCpy(
3160 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3161 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3162 SizeVal, false);
3163 break;
3164 }
3165 };
3166 }
3167
3168 // Step 3.1: Modify reference in dest Reduce list as needed.
3169 // Modifying the reference in Reduce list to point to the newly
3170 // created element. The element is live in the current function
3171 // scope and that of functions it invokes (i.e., reduce_function).
3172 // RemoteReduceData[i] = (void*)&RemoteElem
3173 if (UpdateDestListPtr) {
3174 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3175 DestElementAddr, Builder.getPtrTy(),
3176 DestElementAddr->getName() + ".ascast");
3177 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3178 }
3179 }
3180
3181 return Error::success();
3182}
3183
3184Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3185 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3186 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3187 InsertPointTy SavedIP = Builder.saveIP();
3188 LLVMContext &Ctx = M.getContext();
3189 FunctionType *FuncTy = FunctionType::get(
3190 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3191 /* IsVarArg */ false);
3192 Function *WcFunc =
3194 "_omp_reduction_inter_warp_copy_func", &M);
3195 WcFunc->setAttributes(FuncAttrs);
3196 WcFunc->addParamAttr(0, Attribute::NoUndef);
3197 WcFunc->addParamAttr(1, Attribute::NoUndef);
3198 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3199 Builder.SetInsertPoint(EntryBB);
3200
3201 // ReduceList: thread local Reduce list.
3202 // At the stage of the computation when this function is called, partially
3203 // aggregated values reside in the first lane of every active warp.
3204 Argument *ReduceListArg = WcFunc->getArg(0);
3205 // NumWarps: number of warps active in the parallel region. This could
3206 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3207 Argument *NumWarpsArg = WcFunc->getArg(1);
3208
3209 // This array is used as a medium to transfer, one reduce element at a time,
3210 // the data from the first lane of every warp to lanes in the first warp
3211 // in order to perform the final step of a reduction in a parallel region
3212 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3213 // for reduced latency, as well as to have a distinct copy for concurrently
3214 // executing target regions. The array is declared with common linkage so
3215 // as to be shared across compilation units.
3216 StringRef TransferMediumName =
3217 "__openmp_nvptx_data_transfer_temporary_storage";
3218 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3219 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3220 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3221 if (!TransferMedium) {
3222 TransferMedium = new GlobalVariable(
3223 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3224 UndefValue::get(ArrayTy), TransferMediumName,
3225 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3226 /*AddressSpace=*/3);
3227 }
3228
3229 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3230 Value *GPUThreadID = getGPUThreadID();
3231 // nvptx_lane_id = nvptx_id % warpsize
3232 Value *LaneID = getNVPTXLaneID();
3233 // nvptx_warp_id = nvptx_id / warpsize
3234 Value *WarpID = getNVPTXWarpID();
3235
3236 InsertPointTy AllocaIP =
3237 InsertPointTy(Builder.GetInsertBlock(),
3238 Builder.GetInsertBlock()->getFirstInsertionPt());
3239 Type *Arg0Type = ReduceListArg->getType();
3240 Type *Arg1Type = NumWarpsArg->getType();
3241 Builder.restoreIP(AllocaIP);
3242 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3243 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3244 AllocaInst *NumWarpsAlloca =
3245 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3246 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3247 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3248 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3249 NumWarpsAlloca, Builder.getPtrTy(0),
3250 NumWarpsAlloca->getName() + ".ascast");
3251 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3252 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3253 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3254 InsertPointTy CodeGenIP =
3255 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3256 Builder.restoreIP(CodeGenIP);
3257
3258 Value *ReduceList =
3259 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3260
3261 for (auto En : enumerate(ReductionInfos)) {
3262 //
3263 // Warp master copies reduce element to transfer medium in __shared__
3264 // memory.
3265 //
3266 const ReductionInfo &RI = En.value();
3267 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3268 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3269 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3270 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3271 Type *CType = Builder.getIntNTy(TySize * 8);
3272
3273 unsigned NumIters = RealTySize / TySize;
3274 if (NumIters == 0)
3275 continue;
3276 Value *Cnt = nullptr;
3277 Value *CntAddr = nullptr;
3278 BasicBlock *PrecondBB = nullptr;
3279 BasicBlock *ExitBB = nullptr;
3280 if (NumIters > 1) {
3281 CodeGenIP = Builder.saveIP();
3282 Builder.restoreIP(AllocaIP);
3283 CntAddr =
3284 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3285
3286 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3287 CntAddr->getName() + ".ascast");
3288 Builder.restoreIP(CodeGenIP);
3289 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3290 CntAddr,
3291 /*Volatile=*/false);
3292 PrecondBB = BasicBlock::Create(Ctx, "precond");
3293 ExitBB = BasicBlock::Create(Ctx, "exit");
3294 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3295 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3296 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3297 /*Volatile=*/false);
3298 Value *Cmp = Builder.CreateICmpULT(
3299 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3300 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3301 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3302 }
3303
3304 // kmpc_barrier.
3305 InsertPointOrErrorTy BarrierIP1 =
3306 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3307 omp::Directive::OMPD_unknown,
3308 /* ForceSimpleCall */ false,
3309 /* CheckCancelFlag */ true);
3310 if (!BarrierIP1)
3311 return BarrierIP1.takeError();
3312 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3313 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3314 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3315
3316 // if (lane_id == 0)
3317 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3318 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3319 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3320
3321 // Reduce element = LocalReduceList[i]
3322 auto *RedListArrayTy =
3323 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3324 Type *IndexTy = Builder.getIndexTy(
3325 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3326 Value *ElemPtrPtr =
3327 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3328 {ConstantInt::get(IndexTy, 0),
3329 ConstantInt::get(IndexTy, En.index())});
3330 // elemptr = ((CopyType*)(elemptrptr)) + I
3331 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3332
3333 if (IsByRefElem) {
3334 InsertPointOrErrorTy GenRes =
3335 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3336
3337 if (!GenRes)
3338 return GenRes.takeError();
3339
3340 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3341 }
3342
3343 if (NumIters > 1)
3344 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3345
3346 // Get pointer to location in transfer medium.
3347 // MediumPtr = &medium[warp_id]
3348 Value *MediumPtr = Builder.CreateInBoundsGEP(
3349 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3350 // elem = *elemptr
3351 //*MediumPtr = elem
3352 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3353 // Store the source element value to the dest element address.
3354 Builder.CreateStore(Elem, MediumPtr,
3355 /*IsVolatile*/ true);
3356 Builder.CreateBr(MergeBB);
3357
3358 // else
3359 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3360 Builder.CreateBr(MergeBB);
3361
3362 // endif
3363 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3364 InsertPointOrErrorTy BarrierIP2 =
3365 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3366 omp::Directive::OMPD_unknown,
3367 /* ForceSimpleCall */ false,
3368 /* CheckCancelFlag */ true);
3369 if (!BarrierIP2)
3370 return BarrierIP2.takeError();
3371
3372 // Warp 0 copies reduce element from transfer medium
3373 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3374 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3375 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3376
3377 Value *NumWarpsVal =
3378 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3379 // Up to 32 threads in warp 0 are active.
3380 Value *IsActiveThread =
3381 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3382 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3383
3384 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3385
3386 // SecMediumPtr = &medium[tid]
3387 // SrcMediumVal = *SrcMediumPtr
3388 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3389 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3390 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3391 Value *TargetElemPtrPtr =
3392 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3393 {ConstantInt::get(IndexTy, 0),
3394 ConstantInt::get(IndexTy, En.index())});
3395 Value *TargetElemPtrVal =
3396 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3397 Value *TargetElemPtr = TargetElemPtrVal;
3398
3399 if (IsByRefElem) {
3400 InsertPointOrErrorTy GenRes =
3401 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3402
3403 if (!GenRes)
3404 return GenRes.takeError();
3405
3406 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3407 }
3408
3409 if (NumIters > 1)
3410 TargetElemPtr =
3411 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3412
3413 // *TargetElemPtr = SrcMediumVal;
3414 Value *SrcMediumValue =
3415 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3416 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3417 Builder.CreateBr(W0MergeBB);
3418
3419 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3420 Builder.CreateBr(W0MergeBB);
3421
3422 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3423
3424 if (NumIters > 1) {
3425 Cnt = Builder.CreateNSWAdd(
3426 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3427 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3428
3429 auto *CurFn = Builder.GetInsertBlock()->getParent();
3430 emitBranch(PrecondBB);
3431 emitBlock(ExitBB, CurFn);
3432 }
3433 RealTySize %= TySize;
3434 }
3435 }
3436
3437 Builder.CreateRetVoid();
3438 Builder.restoreIP(SavedIP);
3439
3440 return WcFunc;
3441}
3442
/// Emits the device helper
///   void _omp_reduction_shuffle_and_reduce_func(ptr ReduceList, i16 LaneId,
///                                               i16 RemoteLaneOffset,
///                                               i16 AlgoVer)
/// which copies each reduction element from a remote warp lane into a
/// stack-private remote reduce list, conditionally combines that list into
/// the local one via \p ReduceFn, and, for AlgoVer==1 lanes with
/// LaneId >= RemoteLaneOffset, copies the remote list back over the local
/// list. Returns the created function, or the error propagated from
/// emitReductionListCopy.
/// NOTE(review): every line of this listing carries a spurious numeric
/// prefix, and the Function::Create(...) call initializing SarFunc was lost
/// in rendering (gap 3452 -> 3454); restore from upstream before building.
3443Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3444    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3445    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3446  LLVMContext &Ctx = M.getContext();
3447  FunctionType *FuncTy =
3448      FunctionType::get(Builder.getVoidTy(),
3449                        {Builder.getPtrTy(), Builder.getInt16Ty(),
3450                         Builder.getInt16Ty(), Builder.getInt16Ty()},
3451                        /* IsVarArg */ false);
3452  Function *SarFunc =
3454                       "_omp_reduction_shuffle_and_reduce_func", &M);
3455  SarFunc->setAttributes(FuncAttrs);
3456  SarFunc->addParamAttr(0, Attribute::NoUndef);
3457  SarFunc->addParamAttr(1, Attribute::NoUndef);
3458  SarFunc->addParamAttr(2, Attribute::NoUndef);
3459  SarFunc->addParamAttr(3, Attribute::NoUndef);
  // The three i16 parameters are marked sign-extended for the call ABI.
3460  SarFunc->addParamAttr(1, Attribute::SExt);
3461  SarFunc->addParamAttr(2, Attribute::SExt);
3462  SarFunc->addParamAttr(3, Attribute::SExt);
3463  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3464  Builder.SetInsertPoint(EntryBB);
3465
3466  // Thread local Reduce list used to host the values of data to be reduced.
3467  Argument *ReduceListArg = SarFunc->getArg(0);
3468  // Current lane id; could be logical.
3469  Argument *LaneIDArg = SarFunc->getArg(1);
3470  // Offset of the remote source lane relative to the current lane.
3471  Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3472  // Algorithm version. This is expected to be known at compile time.
3473  Argument *AlgoVerArg = SarFunc->getArg(3);
3474
  // Spill every argument to an alloca (address-space cast to a generic
  // pointer) and reload, mirroring the clang-generated entry-block pattern.
3475  Type *ReduceListArgType = ReduceListArg->getType();
3476  Type *LaneIDArgType = LaneIDArg->getType();
3477  Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3478  Value *ReduceListAlloca = Builder.CreateAlloca(
3479      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3480  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3481                                             LaneIDArg->getName() + ".addr");
3482  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3483      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3484  Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3485                                              AlgoVerArg->getName() + ".addr");
3486  ArrayType *RedListArrayTy =
3487      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3488
3489  // Create a local thread-private variable to host the Reduce list
3490  // from a remote lane.
3491  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3492      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3493
3494  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495      ReduceListAlloca, ReduceListArgType,
3496      ReduceListAlloca->getName() + ".ascast");
3497  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3498      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3499  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3500      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3501      RemoteLaneOffsetAlloca->getName() + ".ascast");
3502  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3503      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3504  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3505      RemoteReductionListAlloca, Builder.getPtrTy(),
3506      RemoteReductionListAlloca->getName() + ".ascast");
3507
3508  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3509  Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3510  Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3511  Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3512
3513  Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3514  Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3515  Value *RemoteLaneOffset =
3516      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3517  Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3518
3519  InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3520
3521  // This loop iterates through the list of reduce elements and copies,
3522  // element by element, from a remote lane in the warp to RemoteReduceList,
3523  // hosted on the thread's stack.
3524  Error EmitRedLsCpRes = emitReductionListCopy(
3525      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3526      ReduceList, RemoteListAddrCast, IsByRef,
3527      {RemoteLaneOffset, nullptr, nullptr});
3528
  // llvm::Error converts to true on failure; propagate it to the caller.
3529  if (EmitRedLsCpRes)
3530    return EmitRedLsCpRes;
3531
3532  // The actions to be performed on the Remote Reduce list is dependent
3533  // on the algorithm version.
3534  //
3535  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3536  // LaneId % 2 == 0 && Offset > 0):
3537  //   do the reduction value aggregation
3538  //
3539  // The thread local variable Reduce list is mutated in place to host the
3540  // reduced data, which is the aggregated value produced from local and
3541  // remote lanes.
3542  //
3543  // Note that AlgoVer is expected to be a constant integer known at compile
3544  // time.
3545  // When AlgoVer==0, the first conjunction evaluates to true, making
3546  // the entire predicate true during compile time.
3547  // When AlgoVer==1, the second conjunction has only the second part to be
3548  // evaluated during runtime. Other conjunctions evaluates to false
3549  // during compile time.
3550  // When AlgoVer==2, the third conjunction has only the second part to be
3551  // evaluated during runtime. Other conjunctions evaluates to false
3552  // during compile time.
3553  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3554  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3555  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3556  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3557  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3558  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3559  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3560  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3561  Value *RemoteOffsetComp =
3562      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3563  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3564  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3565  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3566
3567  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3568  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3569  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3570
  // Guarded reduction: only the lanes selected above call ReduceFn.
3571  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3572  emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3573  Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3574      ReduceList, Builder.getPtrTy());
3575  Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3576      RemoteListAddrCast, Builder.getPtrTy());
3577  createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3578      ->addFnAttr(Attribute::NoUnwind);
3579  Builder.CreateBr(MergeBB);
3580
3581  emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3582  Builder.CreateBr(MergeBB);
3583
3584  emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3585
3586  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3587  // Reduce list.
3588  Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3589  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3590  Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3591
3592  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3593  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3594  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3595  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3596
3597  emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3598
3599  EmitRedLsCpRes = emitReductionListCopy(
3600      AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3601      RemoteListAddrCast, ReduceList, IsByRef);
3602
3603  if (EmitRedLsCpRes)
3604    return EmitRedLsCpRes;
3605
3606  Builder.CreateBr(CpyMergeBB);
3607
3608  emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3609  Builder.CreateBr(CpyMergeBB);
3610
3611  emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3612
3613  Builder.CreateRetVoid();
3614
3615  return SarFunc;
3616}
3617
3619OpenMPIRBuilder::generateReductionDescriptor(
3620 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3621 Type *DescriptorType,
3622 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3623 DataPtrPtrGen) {
3624
3625 // Copy the source descriptor to preserve all metadata (rank, extents,
3626 // strides, etc.)
3627 Value *DescriptorSize =
3628 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3629 Builder.CreateMemCpy(
3630 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3631 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3632 DescriptorSize);
3633
3634 // Update the base pointer field to point to the local shuffled data
3635 Value *DataPtrField;
3636 InsertPointOrErrorTy GenResult =
3637 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3638
3639 if (!GenResult)
3640 return GenResult.takeError();
3641
3642 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3643 DataPtr, Builder.getPtrTy(), ".ascast"),
3644 DataPtrField);
3645
3646 return Builder.saveIP();
3647}
3648
/// Emits "_omp_reduction_list_to_global_copy_func(ptr Buffer, i32 Idx,
/// ptr ReduceList)": for each reduction element, copies the thread-local
/// value ReduceList[i] into the global buffer slot Buffer[Idx].field(i),
/// with scalar / complex / aggregate handling chosen by RI.EvaluationKind.
/// By-ref scalars are dereferenced through RI.DataPtrPtrGen first.
/// Returns the created function or a propagated error.
/// NOTE(review): each line carries a spurious numeric prefix and the
/// Function::Create(...) call initializing LtGCFunc was lost in rendering
/// (gap 3658 -> 3660); restore from upstream before building.
3649Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3650    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3651    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
  // Preserve the caller's insertion point; this routine builds a separate
  // helper function and restores the IP before returning.
3652  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3653  LLVMContext &Ctx = M.getContext();
3654  FunctionType *FuncTy = FunctionType::get(
3655      Builder.getVoidTy(),
3656      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3657      /* IsVarArg */ false);
3658  Function *LtGCFunc =
3660                       "_omp_reduction_list_to_global_copy_func", &M);
3661  LtGCFunc->setAttributes(FuncAttrs);
3662  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3663  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3664  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3665
3666  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3667  Builder.SetInsertPoint(EntryBlock);
3668
3669  // Buffer: global reduction buffer.
3670  Argument *BufferArg = LtGCFunc->getArg(0);
3671  // Idx: index of the buffer.
3672  Argument *IdxArg = LtGCFunc->getArg(1);
3673  // ReduceList: thread local Reduce list.
3674  Argument *ReduceListArg = LtGCFunc->getArg(2);
3675
  // Spill arguments to allocas (address-space cast to generic) and reload.
3676  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3677                                                BufferArg->getName() + ".addr");
3678  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3679                                             IdxArg->getName() + ".addr");
3680  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3681      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3682  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3683      BufferArgAlloca, Builder.getPtrTy(),
3684      BufferArgAlloca->getName() + ".ascast");
3685  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3686      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3687  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3688      ReduceListArgAlloca, Builder.getPtrTy(),
3689      ReduceListArgAlloca->getName() + ".ascast");
3690
3691  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3692  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3693  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3694
3695  Value *LocalReduceList =
3696      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3697  Value *BufferArgVal =
3698      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3699  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3700  Type *IndexTy = Builder.getIndexTy(
3701      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3702  for (auto En : enumerate(ReductionInfos)) {
3703    const ReductionInfo &RI = En.value();
3704    auto *RedListArrayTy =
3705        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3706    // Reduce element = LocalReduceList[i]
3707    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3708        RedListArrayTy, LocalReduceList,
3709        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3710    // elemptr = ((CopyType*)(elemptrptr)) + I
3711    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3712
3713    // Global = Buffer.VD[Idx];
3714    Value *BufferVD =
3715        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3716    Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3717        ReductionsBufferTy, BufferVD, 0, En.index());
3718
3719    switch (RI.EvaluationKind) {
3720    case EvalKind::Scalar: {
3721      Value *TargetElement;
3722
3723      if (IsByRef.empty() || !IsByRef[En.index()]) {
3724        TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3725      } else {
        // By-ref element: DataPtrPtrGen yields the address of the data
        // pointer inside the descriptor; load twice to reach the value.
3726        InsertPointOrErrorTy GenResult =
3727            RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3728
3729        if (!GenResult)
3730          return GenResult.takeError();
3731
3732        ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3733        TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3734      }
3735
3736      Builder.CreateStore(TargetElement, GlobVal);
3737      break;
3738    }
3739    case EvalKind::Complex: {
      // Copy real and imaginary parts field by field.
3740      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3741          RI.ElementType, ElemPtr, 0, 0, ".realp");
3742      Value *SrcReal = Builder.CreateLoad(
3743          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3744      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3745          RI.ElementType, ElemPtr, 0, 1, ".imagp");
3746      Value *SrcImg = Builder.CreateLoad(
3747          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3748
3749      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3750          RI.ElementType, GlobVal, 0, 0, ".realp");
3751      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3752          RI.ElementType, GlobVal, 0, 1, ".imagp");
3753      Builder.CreateStore(SrcReal, DestRealPtr);
3754      Builder.CreateStore(SrcImg, DestImgPtr);
3755      break;
3756    }
3757    case EvalKind::Aggregate: {
      // Aggregates are copied wholesale with a memcpy of the store size.
3758      Value *SizeVal =
3759          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3760      Builder.CreateMemCpy(
3761          GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3762          M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3763      break;
3764    }
3765    }
3766  }
3767
3768  Builder.CreateRetVoid();
3769  Builder.restoreIP(OldIP);
3770  return LtGCFunc;
3771}
3772
/// Emits "_omp_reduction_list_to_global_reduce_func(ptr Buffer, i32 Idx,
/// ptr ReduceList)": builds a local reduce list whose entries point at the
/// global buffer slots Buffer[Idx].field(i) — for by-ref elements, at a
/// freshly alloca'd descriptor redirected to the global slot via
/// generateReductionDescriptor — then calls \p ReduceFn with
/// (GlobalReduceList, ReduceList). Returns the created function or a
/// propagated error.
/// NOTE(review): each line carries a spurious numeric prefix and the
/// Function::Create(...) call initializing LtGRFunc was lost in rendering
/// (gap 3782 -> 3784); restore from upstream before building.
3773Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3774    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3775    Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
  // Preserve the caller's insertion point while the helper is emitted.
3776  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3777  LLVMContext &Ctx = M.getContext();
3778  FunctionType *FuncTy = FunctionType::get(
3779      Builder.getVoidTy(),
3780      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3781      /* IsVarArg */ false);
3782  Function *LtGRFunc =
3784                       "_omp_reduction_list_to_global_reduce_func", &M);
3785  LtGRFunc->setAttributes(FuncAttrs);
3786  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3787  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3788  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3789
3790  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3791  Builder.SetInsertPoint(EntryBlock);
3792
3793  // Buffer: global reduction buffer.
3794  Argument *BufferArg = LtGRFunc->getArg(0);
3795  // Idx: index of the buffer.
3796  Argument *IdxArg = LtGRFunc->getArg(1);
3797  // ReduceList: thread local Reduce list.
3798  Argument *ReduceListArg = LtGRFunc->getArg(2);
3799
3800  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3801                                                BufferArg->getName() + ".addr");
3802  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3803                                             IdxArg->getName() + ".addr");
3804  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3805      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3806  auto *RedListArrayTy =
3807      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3808
3809  // 1. Build a list of reduction variables.
3810  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3811  Value *LocalReduceList =
3812      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3813
  // By-ref descriptor allocas below are emitted at the top of the entry
  // block so they dominate all uses.
3814  InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3815
3816  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3817      BufferArgAlloca, Builder.getPtrTy(),
3818      BufferArgAlloca->getName() + ".ascast");
3819  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3820      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3821  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3822      ReduceListArgAlloca, Builder.getPtrTy(),
3823      ReduceListArgAlloca->getName() + ".ascast");
3824  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3825      LocalReduceList, Builder.getPtrTy(),
3826      LocalReduceList->getName() + ".ascast");
3827
3828  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3829  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3830  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3831
3832  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3833  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3834  Type *IndexTy = Builder.getIndexTy(
3835      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3836  for (auto En : enumerate(ReductionInfos)) {
3837    const ReductionInfo &RI = En.value();
3838    Value *ByRefAlloc;
3839
3840    if (!IsByRef.empty() && IsByRef[En.index()]) {
3841      InsertPointTy OldIP = Builder.saveIP();
3842      Builder.restoreIP(AllocaIP);
3843
3844      ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3845      ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3846          ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3847
3848      Builder.restoreIP(OldIP);
3849    }
3850
3851    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3852        RedListArrayTy, LocalReduceListAddrCast,
3853        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3854    Value *BufferVD =
3855        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3856    // Global = Buffer.VD[Idx];
3857    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3858        ReductionsBufferTy, BufferVD, 0, En.index());
3859
3860    if (!IsByRef.empty() && IsByRef[En.index()]) {
3861      // Get source descriptor from the reduce list argument
3862      Value *ReduceList =
3863          Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3864      Value *SrcElementPtrPtr =
3865          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3866                                    {ConstantInt::get(IndexTy, 0),
3867                                     ConstantInt::get(IndexTy, En.index())});
3868      Value *SrcDescriptorAddr =
3869          Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3870
3871      // Copy descriptor from source and update base_ptr to global buffer data
3872      InsertPointOrErrorTy GenResult =
3873          generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3874                                      RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3875
3876      if (!GenResult)
3877        return GenResult.takeError();
3878
3879      Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3880    } else {
3881      Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3882    }
3883  }
3884
3885  // Call reduce_function(GlobalReduceList, ReduceList)
3886  Value *ReduceList =
3887      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3888  createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3889      ->addFnAttr(Attribute::NoUnwind);
3890  Builder.CreateRetVoid();
3891  Builder.restoreIP(OldIP);
3892  return LtGRFunc;
3893}
3894
/// Emits "_omp_reduction_global_to_list_copy_func(ptr Buffer, i32 Idx,
/// ptr ReduceList)" — the inverse of emitListToGlobalCopyFunction: for each
/// reduction element, copies the value from the global buffer slot
/// Buffer[Idx].field(i) back into the thread-local ReduceList[i], with
/// scalar / complex / aggregate handling chosen by RI.EvaluationKind.
/// Returns the created function or a propagated error.
/// NOTE(review): each line carries a spurious numeric prefix and the
/// Function::Create(...) call initializing GtLCFunc was lost in rendering
/// (gap 3904 -> 3906); restore from upstream before building.
3895Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3896    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3897    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
  // Preserve the caller's insertion point while the helper is emitted.
3898  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3899  LLVMContext &Ctx = M.getContext();
3900  FunctionType *FuncTy = FunctionType::get(
3901      Builder.getVoidTy(),
3902      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3903      /* IsVarArg */ false);
3904  Function *GtLCFunc =
3906                       "_omp_reduction_global_to_list_copy_func", &M);
3907  GtLCFunc->setAttributes(FuncAttrs);
3908  GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3909  GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3910  GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3911
3912  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3913  Builder.SetInsertPoint(EntryBlock);
3914
3915  // Buffer: global reduction buffer.
3916  Argument *BufferArg = GtLCFunc->getArg(0);
3917  // Idx: index of the buffer.
3918  Argument *IdxArg = GtLCFunc->getArg(1);
3919  // ReduceList: thread local Reduce list.
3920  Argument *ReduceListArg = GtLCFunc->getArg(2);
3921
3922  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3923                                                BufferArg->getName() + ".addr");
3924  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3925                                             IdxArg->getName() + ".addr");
3926  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3927      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3928  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3929      BufferArgAlloca, Builder.getPtrTy(),
3930      BufferArgAlloca->getName() + ".ascast");
3931  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3932      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3933  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3934      ReduceListArgAlloca, Builder.getPtrTy(),
3935      ReduceListArgAlloca->getName() + ".ascast");
3936  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3937  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3938  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3939
3940  Value *LocalReduceList =
3941      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3942  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3943  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3944  Type *IndexTy = Builder.getIndexTy(
3945      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3946  for (auto En : enumerate(ReductionInfos)) {
3947    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3948    auto *RedListArrayTy =
3949        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3950    // Reduce element = LocalReduceList[i]
3951    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3952        RedListArrayTy, LocalReduceList,
3953        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3954    // elemptr = ((CopyType*)(elemptrptr)) + I
3955    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3956    // Global = Buffer.VD[Idx];
3957    Value *BufferVD =
3958        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3959    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3960        ReductionsBufferTy, BufferVD, 0, En.index());
3961
3962    switch (RI.EvaluationKind) {
3963    case EvalKind::Scalar: {
3964      Type *ElemType = RI.ElementType;
3965
3966      if (!IsByRef.empty() && IsByRef[En.index()]) {
        // By-ref element: DataPtrPtrGen yields the address of the data
        // pointer inside the descriptor; load it to get the destination.
3967        ElemType = RI.ByRefElementType;
3968        InsertPointOrErrorTy GenResult =
3969            RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3970
3971        if (!GenResult)
3972          return GenResult.takeError();
3973
3974        ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3975      }
3976
3977      Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3978      Builder.CreateStore(TargetElement, ElemPtr);
3979      break;
3980    }
3981    case EvalKind::Complex: {
      // Copy real and imaginary parts field by field.
3982      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3983          RI.ElementType, GlobValPtr, 0, 0, ".realp");
3984      Value *SrcReal = Builder.CreateLoad(
3985          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3986      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3987          RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3988      Value *SrcImg = Builder.CreateLoad(
3989          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3990
3991      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3992          RI.ElementType, ElemPtr, 0, 0, ".realp");
3993      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3994          RI.ElementType, ElemPtr, 0, 1, ".imagp");
3995      Builder.CreateStore(SrcReal, DestRealPtr);
3996      Builder.CreateStore(SrcImg, DestImgPtr);
3997      break;
3998    }
3999    case EvalKind::Aggregate: {
      // Aggregates are copied wholesale with a memcpy of the store size.
4000      Value *SizeVal =
4001          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4002      Builder.CreateMemCpy(
4003          ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4004          GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4005          SizeVal, false);
4006      break;
4007    }
4008    }
4009  }
4010
4011  Builder.CreateRetVoid();
4012  Builder.restoreIP(OldIP);
4013  return GtLCFunc;
4014}
4015
/// Emits "_omp_reduction_global_to_list_reduce_func(ptr Buffer, i32 Idx,
/// ptr ReduceList)": builds a local reduce list whose entries point at the
/// global buffer slots Buffer[Idx].field(i) — for by-ref elements, at a
/// freshly alloca'd descriptor redirected to the global slot via
/// generateReductionDescriptor — then calls \p ReduceFn with
/// (ReduceList, GlobalReduceList); note the argument order is the mirror
/// of emitListToGlobalReduceFunction. Returns the created function or a
/// propagated error.
/// NOTE(review): each line carries a spurious numeric prefix and the
/// Function::Create(...) call initializing GtLRFunc was lost in rendering
/// (gap 4025 -> 4027); restore from upstream before building.
4016Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4017    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4018    Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
  // Preserve the caller's insertion point while the helper is emitted.
4019  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4020  LLVMContext &Ctx = M.getContext();
4021  auto *FuncTy = FunctionType::get(
4022      Builder.getVoidTy(),
4023      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4024      /* IsVarArg */ false);
4025  Function *GtLRFunc =
4027                       "_omp_reduction_global_to_list_reduce_func", &M);
4028  GtLRFunc->setAttributes(FuncAttrs);
4029  GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4030  GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4031  GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4032
4033  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4034  Builder.SetInsertPoint(EntryBlock);
4035
4036  // Buffer: global reduction buffer.
4037  Argument *BufferArg = GtLRFunc->getArg(0);
4038  // Idx: index of the buffer.
4039  Argument *IdxArg = GtLRFunc->getArg(1);
4040  // ReduceList: thread local Reduce list.
4041  Argument *ReduceListArg = GtLRFunc->getArg(2);
4042
4043  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4044                                                BufferArg->getName() + ".addr");
4045  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4046                                             IdxArg->getName() + ".addr");
4047  Value *ReduceListArgAlloca = Builder.CreateAlloca(
4048      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4049  ArrayType *RedListArrayTy =
4050      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4051
4052  // 1. Build a list of reduction variables.
4053  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4054  Value *LocalReduceList =
4055      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4056
  // By-ref descriptor allocas below are emitted at the top of the entry
  // block so they dominate all uses.
4057  InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4058
4059  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4060      BufferArgAlloca, Builder.getPtrTy(),
4061      BufferArgAlloca->getName() + ".ascast");
4062  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4063      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4064  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4065      ReduceListArgAlloca, Builder.getPtrTy(),
4066      ReduceListArgAlloca->getName() + ".ascast");
4067  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4068      LocalReduceList, Builder.getPtrTy(),
4069      LocalReduceList->getName() + ".ascast");
4070
4071  Builder.CreateStore(BufferArg, BufferArgAddrCast);
4072  Builder.CreateStore(IdxArg, IdxArgAddrCast);
4073  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4074
4075  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4076  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4077  Type *IndexTy = Builder.getIndexTy(
4078      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4079  for (auto En : enumerate(ReductionInfos)) {
4080    const ReductionInfo &RI = En.value();
4081    Value *ByRefAlloc;
4082
4083    if (!IsByRef.empty() && IsByRef[En.index()]) {
4084      InsertPointTy OldIP = Builder.saveIP();
4085      Builder.restoreIP(AllocaIP);
4086
4087      ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4088      ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4089          ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4090
4091      Builder.restoreIP(OldIP);
4092    }
4093
4094    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4095        RedListArrayTy, ReductionList,
4096        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4097    // Global = Buffer.VD[Idx];
4098    Value *BufferVD =
4099        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4100    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4101        ReductionsBufferTy, BufferVD, 0, En.index());
4102
4103    if (!IsByRef.empty() && IsByRef[En.index()]) {
4104      // Get source descriptor from the reduce list
4105      Value *ReduceListVal =
4106          Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4107      Value *SrcElementPtrPtr =
4108          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4109                                    {ConstantInt::get(IndexTy, 0),
4110                                     ConstantInt::get(IndexTy, En.index())});
4111      Value *SrcDescriptorAddr =
4112          Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4113
4114      // Copy descriptor from source and update base_ptr to global buffer data
4115      InsertPointOrErrorTy GenResult =
4116          generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4117                                      RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4118      if (!GenResult)
4119        return GenResult.takeError();
4120
4121      Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4122    } else {
4123      Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4124    }
4125  }
4126
4127  // Call reduce_function(ReduceList, GlobalReduceList)
4128  Value *ReduceList =
4129      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4130  createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4131      ->addFnAttr(Attribute::NoUnwind);
4132  Builder.CreateRetVoid();
4133  Builder.restoreIP(OldIP);
4134  return GtLRFunc;
4135}
4136
4137std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4138 std::string Suffix =
4139 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4140 return (Name + Suffix).str();
4141}
4142
// Create the outlined elementwise reduction function used by the device
// reduction runtime. The emitted function has type void(ptr, ptr): both
// arguments are pointers to arrays of type-erased per-variable pointers
// (LHS and RHS); the body combines each RHS element into the corresponding
// LHS element using the per-variable reduction generator callbacks.
4143Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4144    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4146    AttributeList FuncAttrs) {
4147  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4148                                   {Builder.getPtrTy(), Builder.getPtrTy()},
4149                                   /* IsVarArg */ false);
4150  std::string Name = getReductionFuncName(ReducerName);
4151  Function *ReductionFunc =
4153  ReductionFunc->setAttributes(FuncAttrs);
4154  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4155  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4156  BasicBlock *EntryBB =
4157      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4158  Builder.SetInsertPoint(EntryBB);
4159
4160  // Need to alloca memory here and deal with the pointers before getting
4161  // LHS/RHS pointers out
4162  Value *LHSArrayPtr = nullptr;
4163  Value *RHSArrayPtr = nullptr;
4164  Argument *Arg0 = ReductionFunc->getArg(0);
4165  Argument *Arg1 = ReductionFunc->getArg(1);
4166  Type *Arg0Type = Arg0->getType();
4167  Type *Arg1Type = Arg1->getType();
4168
  // Spill both pointer arguments through allocas and address-space casts so
  // they can be reloaded uniformly below (needed on targets where allocas
  // live in a non-default address space).
4169  Value *LHSAlloca =
4170      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4171  Value *RHSAlloca =
4172      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4173  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4174      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4175  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4176      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4177  Builder.CreateStore(Arg0, LHSAddrCast);
4178  Builder.CreateStore(Arg1, RHSAddrCast);
4179  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4180  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4181
4182  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4183  Type *IndexTy = Builder.getIndexTy(
4184      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4185  SmallVector<Value *> LHSPtrs, RHSPtrs;
  // For each reduction variable, extract the i-th element pointer from the
  // RHS and LHS arrays and cast it back to the variable's pointer type.
4186  for (auto En : enumerate(ReductionInfos)) {
4187    const ReductionInfo &RI = En.value();
4188    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4189        RedArrayTy, RHSArrayPtr,
4190        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4191    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4192    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4193        RHSI8Ptr, RI.PrivateVariable->getType(),
4194        RHSI8Ptr->getName() + ".ascast");
4195
4196    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4197        RedArrayTy, LHSArrayPtr,
4198        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4199    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4200    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4201        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4202
4204      LHSPtrs.emplace_back(LHSPtr);
4205      RHSPtrs.emplace_back(RHSPtr);
4206    } else {
4207      Value *LHS = LHSPtr;
4208      Value *RHS = RHSPtr;
4209
      // For by-value reductions, operate on loaded element values rather
      // than on the element pointers.
4210      if (!IsByRef.empty() && !IsByRef[En.index()]) {
4211        LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4212        RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4213      }
4214
4215      Value *Reduced;
4216      InsertPointOrErrorTy AfterIP =
4217          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4218      if (!AfterIP)
4219        return AfterIP.takeError();
      // The callback may have cleared the insert point; bail out early.
4220      if (!Builder.GetInsertBlock())
4221        return ReductionFunc;
4222
4223      Builder.restoreIP(*AfterIP);
4224
4225      if (!IsByRef.empty() && !IsByRef[En.index()])
4226        Builder.CreateStore(Reduced, LHSPtr);
4227    }
4228  }
4229
4231  for (auto En : enumerate(ReductionInfos)) {
4232    unsigned Index = En.index();
4233    const ReductionInfo &RI = En.value();
4234    Value *LHSFixupPtr, *RHSFixupPtr;
4235    Builder.restoreIP(RI.ReductionGenClang(
4236        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4237
4238    // Fix the callback code generated to use the correct Values for the LHS
4239    // and RHS, but only rewrite uses inside this reduction function.
4240    LHSFixupPtr->replaceUsesWithIf(
4241        LHSPtrs[Index], [ReductionFunc](const Use &U) {
4242          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4243                 ReductionFunc;
4244        });
4245    RHSFixupPtr->replaceUsesWithIf(
4246        RHSPtrs[Index], [ReductionFunc](const Use &U) {
4247          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4248                 ReductionFunc;
4249        });
4250  }
4251
4252  Builder.CreateRetVoid();
4253  // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4254  // to the entry block (this is done for higher opt levels by later passes in
4255  // the pipeline). This has caused issues because non-entry `alloca`s force the
4256  // function to use dynamic stack allocations and we might run out of scratch
4257  // memory.
4258  hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4259
4260  return ReductionFunc;
4261}
4262
// Assert the well-formedness of a list of ReductionInfos: every entry must
// carry a variable, a private copy, and at least one reduction generator
// callback. On the host path, a variable and its private equivalent must
// additionally share a type (on GPU they may live in different address
// spaces).
4263static void
4265                    bool IsGPU) {
4266  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
    // Silence the unused-variable warning in NDEBUG builds where all the
    // asserts compile away.
4267    (void)RI;
4268    assert(RI.Variable && "expected non-null variable");
4269    assert(RI.PrivateVariable && "expected non-null private variable");
4270    assert((RI.ReductionGen || RI.ReductionGenClang) &&
4271           "expected non-null reduction generator callback");
4272    if (!IsGPU) {
4273      assert(
4274          RI.Variable->getType() == RI.PrivateVariable->getType() &&
4275          "expected variables and their private equivalents to have the same "
4276          "type");
4277    }
4278    assert(RI.Variable->getType()->isPointerTy() &&
4279           "expected variables to be pointers");
4280  }
4281}
4282
// Emit a GPU (device) reduction. Builds the outlined elementwise reduction
// function plus the runtime helper functions (shuffle-and-reduce, inter-warp
// copy, and — for teams reductions — the list<->global copy/reduce helpers),
// then calls the matching __kmpc_nvptx_*_reduce_nowait_v2 entry point and
// emits the "master thread combines the result" branch.
4284    const LocationDescription &Loc, InsertPointTy AllocaIP,
4285    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4286    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4287    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4288    unsigned ReductionBufNum, Value *SrcLocInfo) {
4289  if (!updateToLocation(Loc))
4290    return InsertPointTy();
4291  Builder.restoreIP(CodeGenIP);
4292  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4293  LLVMContext &Ctx = M.getContext();
4294
4295  // Source location for the ident struct
4296  if (!SrcLocInfo) {
4297    uint32_t SrcLocStrSize;
4298    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4299    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4300  }
4301
  // Nothing to reduce: leave the IR untouched.
4302  if (ReductionInfos.size() == 0)
4303    return Builder.saveIP();
4304
4305  BasicBlock *ContinuationBlock = nullptr;
4307    // Copied code from createReductions
4308    BasicBlock *InsertBlock = Loc.IP.getBlock();
4309    ContinuationBlock =
4310        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4311    InsertBlock->getTerminator()->eraseFromParent();
4312    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4313  }
4314
  // Propagate the current function's attributes to the generated helpers,
  // but drop OptimizeNone so the helpers can be optimized.
4315  Function *CurFunc = Builder.GetInsertBlock()->getParent();
4316  AttributeList FuncAttrs;
4317  AttrBuilder AttrBldr(Ctx);
4318  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4319    AttrBldr.addAttribute(Attr);
4320  AttrBldr.removeAttribute(Attribute::OptimizeNone);
4321  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4322
4323  CodeGenIP = Builder.saveIP();
4324  Expected<Function *> ReductionResult = createReductionFunction(
4325      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4326      ReductionGenCBKind, FuncAttrs);
4327  if (!ReductionResult)
4328    return ReductionResult.takeError();
4329  Function *ReductionFunc = *ReductionResult;
4330  Builder.restoreIP(CodeGenIP);
4331
4332  // Set the grid value in the config needed for lowering later on
4333  if (GridValue.has_value())
4334    Config.setGridValue(GridValue.value());
4335  else
4336    Config.setGridValue(getGridValue(T, ReductionFunc));
4337
4338  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4339  // RedList, shuffle_reduce_func, interwarp_copy_func);
4340  // or
4341  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4342  Value *Res;

4344  // 1. Build a list of reduction variables.
4345  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4346  auto Size = ReductionInfos.size();
4347  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4348  Type *FuncPtrTy =
4349      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4350  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4351  CodeGenIP = Builder.saveIP();
  // The red_list array itself is allocated at the function's alloca point.
4352  Builder.restoreIP(AllocaIP);
4353  Value *ReductionListAlloca =
4354      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4355  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4356      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4357  Builder.restoreIP(CodeGenIP);
4358  Type *IndexTy = Builder.getIndexTy(
4359      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4360  for (auto En : enumerate(ReductionInfos)) {
4361    const ReductionInfo &RI = En.value();
4362    Value *ElemPtr = Builder.CreateInBoundsGEP(
4363        RedArrayTy, ReductionList,
4364        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4365
4366    Value *PrivateVar = RI.PrivateVariable;
    // By-ref entries store the pointee pointer, not the private variable
    // address itself.
4367    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4368    if (IsByRefElem)
4369      PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4370
4371    Value *CastElem =
4372        Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4373    Builder.CreateStore(CastElem, ElemPtr);
4374  }
4375  CodeGenIP = Builder.saveIP();
4376  Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4377      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);

4379  if (!SarFunc)
4380    return SarFunc.takeError();

4382  Expected<Function *> CopyResult =
4383      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4384  if (!CopyResult)
4385    return CopyResult.takeError();
4386  Function *WcFunc = *CopyResult;
4387  Builder.restoreIP(CodeGenIP);

4389  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // The runtime is handed the size of the largest element times the number of
  // variables; collect the per-variable types for the globalized buffer too.
4391  unsigned MaxDataSize = 0;
4392  SmallVector<Type *> ReductionTypeArgs;
4393  for (auto En : enumerate(ReductionInfos)) {
4394    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
4395    if (Size > MaxDataSize)
4396      MaxDataSize = Size;
4397    Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4398                           ? En.value().ByRefElementType
4399                           : En.value().ElementType;
4400    ReductionTypeArgs.emplace_back(RedTypeArg);
4401  }
4402  Value *ReductionDataSize =
4403      Builder.getInt64(MaxDataSize * ReductionInfos.size());
4404  if (!IsTeamsReduction) {
4405    Value *SarFuncCast =
4406        Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4407    Value *WcFuncCast =
4408        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4409    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4410                     WcFuncCast};
4412        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4413    Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4414  } else {
4415    CodeGenIP = Builder.saveIP();
4416    StructType *ReductionsBufferTy = StructType::create(
4417        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4418    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4419        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);

    // Teams reductions additionally need helpers that copy/reduce between
    // the per-team list and the globalized buffer.
4421    Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4422        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4423    if (!LtGCFunc)
4424      return LtGCFunc.takeError();

4426    Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4427        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4428    if (!LtGRFunc)
4429      return LtGRFunc.takeError();

4431    Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4432        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4433    if (!GtLCFunc)
4434      return GtLCFunc.takeError();

4436    Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4437        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4438    if (!GtLRFunc)
4439      return GtLRFunc.takeError();

4441    Builder.restoreIP(CodeGenIP);

4443    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4444        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

4446    Value *Args3[] = {SrcLocInfo,
4447                      KernelTeamsReductionPtr,
4448                      Builder.getInt32(ReductionBufNum),
4449                      ReductionDataSize,
4450                      RL,
4451                      *SarFunc,
4452                      WcFunc,
4453                      *LtGCFunc,
4454                      *LtGRFunc,
4455                      *GtLCFunc,
4456                      *GtLRFunc};

4458    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4459        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4460    Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4461  }

4463  // 5. Build if (res == 1)
4464  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4465  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4466  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4467  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

4469  // 6. Build then branch: where we have reduced values in the master
4470  // thread in each team.
4471  // __kmpc_end_reduce{_nowait}(<gtid>);
4472  // break;
4473  emitBlock(ThenBB, CurFunc);

4475  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4476  for (auto En : enumerate(ReductionInfos)) {
4477    const ReductionInfo &RI = En.value();
4479    Value *RedValue = RI.Variable;
4480    Value *RHS =
4481        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

4484      Value *LHSPtr, *RHSPtr;
4485      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4486                                             &LHSPtr, &RHSPtr, CurFunc));

4488      // Fix the callback code generated to use the correct Values for the LHS
4489      // and RHS
4490      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4491        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4492               ReductionFunc;
4493      });
4494      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4495        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4496               ReductionFunc;
4497      });
4498    } else {
4499      if (IsByRef.empty() || !IsByRef[En.index()]) {
4500        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4501                                      "red.value." + Twine(En.index()));
4502      }
4503      Value *PrivateRedValue = Builder.CreateLoad(
4504          ValueType, RHS, "red.private.value" + Twine(En.index()));
4505      Value *Reduced;
4506      InsertPointOrErrorTy AfterIP =
4507          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4508      if (!AfterIP)
4509        return AfterIP.takeError();
4510      Builder.restoreIP(*AfterIP);

      // For by-value reductions the combined result is written back here;
      // by-ref reductions store inside the generator itself.
4512      if (!IsByRef.empty() && !IsByRef[En.index()])
4513        Builder.CreateStore(Reduced, RI.Variable);
4514    }
4515  }
4516  emitBlock(ExitBB, CurFunc);
4517  if (ContinuationBlock) {
4518    Builder.CreateBr(ContinuationBlock);
4519    Builder.SetInsertPoint(ContinuationBlock);
4520  }
4521  Config.setEmitLLVMUsed();

4523  return Builder.saveIP();
4524}
4525
  // Helper for the host path: create a fresh, empty ".omp.reduction.func"
  // of type void(ptr, ptr) in module M; its body is filled in later by
  // populateReductionFunction.
4527  Type *VoidTy = Type::getVoidTy(M.getContext());
4528  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4529  auto *FuncTy =
4530      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4532                          ".omp.reduction.func", &M);
4533}
4534
// Fill in the body of the outlined reduction function: both arguments are
// arrays of type-erased pointers to the per-variable partial values; each
// RHS element is combined into the corresponding LHS element via the
// per-variable ReductionGen callback.
4536                          Function *ReductionFunc,
4538                          IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4539  Module *Module = ReductionFunc->getParent();
4540  BasicBlock *ReductionFuncBlock =
4541      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4542  Builder.SetInsertPoint(ReductionFuncBlock);
4543  Value *LHSArrayPtr = nullptr;
4544  Value *RHSArrayPtr = nullptr;
4545  if (IsGPU) {
4546    // Need to alloca memory here and deal with the pointers before getting
4547    // LHS/RHS pointers out
4548    //
4549    Argument *Arg0 = ReductionFunc->getArg(0);
4550    Argument *Arg1 = ReductionFunc->getArg(1);
4551    Type *Arg0Type = Arg0->getType();
4552    Type *Arg1Type = Arg1->getType();

4554    Value *LHSAlloca =
4555        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4556    Value *RHSAlloca =
4557        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4558    Value *LHSAddrCast =
4559        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4560    Value *RHSAddrCast =
4561        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4562    Builder.CreateStore(Arg0, LHSAddrCast);
4563    Builder.CreateStore(Arg1, RHSAddrCast);
4564    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4565    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4566  } else {
    // On the host the argument pointers can be used directly.
4567    LHSArrayPtr = ReductionFunc->getArg(0);
4568    RHSArrayPtr = ReductionFunc->getArg(1);
4569  }

4571  unsigned NumReductions = ReductionInfos.size();
4572  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);

4574  for (auto En : enumerate(ReductionInfos)) {
4575    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4576    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4577        RedArrayTy, LHSArrayPtr, 0, En.index());
4578    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4579    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4580        LHSI8Ptr, RI.Variable->getType());
4581    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4582    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4583        RedArrayTy, RHSArrayPtr, 0, En.index());
4584    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4585    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4586        RHSI8Ptr, RI.PrivateVariable->getType());
4587    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4588    Value *Reduced;
4590        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4591    if (!AfterIP)
4592      return AfterIP.takeError();

4594    Builder.restoreIP(*AfterIP);
4595    // TODO: Consider flagging an error.
4596    if (!Builder.GetInsertBlock())
4597      return Error::success();

4599    // store is inside of the reduction region when using by-ref
4600    if (!IsByRef[En.index()])
4601      Builder.CreateStore(Reduced, LHSPtr);
4602  }
4603  Builder.CreateRetVoid();
4604  return Error::success();
4605}
4606
// Emit a host reduction (delegating to createReductionsGPU when targeting a
// device). Builds the type-erased red.array of private values, calls
// __kmpc_reduce{_nowait}, and switches on the result: case 1 performs the
// non-atomic elementwise combine, case 2 the atomic combine (when every
// variable has an AtomicReductionGen and none is by-ref).
4608    const LocationDescription &Loc, InsertPointTy AllocaIP,
4609    ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4610    bool IsNoWait, bool IsTeamsReduction) {
4611  assert(ReductionInfos.size() == IsByRef.size());
4612  if (Config.isGPU())
4613    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4614                               IsByRef, IsNoWait, IsTeamsReduction);

4616  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);

4618  if (!updateToLocation(Loc))
4619    return InsertPointTy();

4621  if (ReductionInfos.size() == 0)
4622    return Builder.saveIP();

  // Split off a continuation block; all reduction control flow re-joins there.
4624  BasicBlock *InsertBlock = Loc.IP.getBlock();
4625  BasicBlock *ContinuationBlock =
4626      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4627  InsertBlock->getTerminator()->eraseFromParent();

4629  // Create and populate array of type-erased pointers to private reduction
4630  // values.
4631  unsigned NumReductions = ReductionInfos.size();
4632  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4633  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4634  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

4636  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

4638  for (auto En : enumerate(ReductionInfos)) {
4639    unsigned Index = En.index();
4640    const ReductionInfo &RI = En.value();
4641    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4642        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4643    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4644  }

4646  // Emit a call to the runtime function that orchestrates the reduction.
4647  // Declare the reduction function in the process.
4648  Type *IndexTy = Builder.getIndexTy(
4649      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4650  Function *Func = Builder.GetInsertBlock()->getParent();
4651  Module *Module = Func->getParent();
4652  uint32_t SrcLocStrSize;
4653  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // The atomic path is only usable when every variable provides an atomic
  // generator; advertise that to the runtime via the ident flags.
4654  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4655    return RI.AtomicReductionGen;
4656  });
4657  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4658                                  CanGenerateAtomic
4659                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4660                                      : IdentFlag(0));
4661  Value *ThreadId = getOrCreateThreadID(Ident);
4662  Constant *NumVariables = Builder.getInt32(NumReductions);
4663  const DataLayout &DL = Module->getDataLayout();
4664  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4665  Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4666  Function *ReductionFunc = getFreshReductionFunc(*Module);
4667  Value *Lock = getOMPCriticalRegionLock(".reduction");
4669      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4670               : RuntimeFunction::OMPRTL___kmpc_reduce);
4671  CallInst *ReduceCall =
4672      createRuntimeFunctionCall(ReduceFunc,
4673                                {Ident, ThreadId, NumVariables, RedArraySize,
4674                                 RedArray, ReductionFunc, Lock},
4675                                "reduce");

4677  // Create final reduction entry blocks for the atomic and non-atomic case.
4678  // Emit IR that dispatches control flow to one of the blocks based on the
4679  // reduction supporting the atomic mode.
4680  BasicBlock *NonAtomicRedBlock =
4681      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4682  BasicBlock *AtomicRedBlock =
4683      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4684  SwitchInst *Switch =
4685      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4686  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4687  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

4689  // Populate the non-atomic reduction using the elementwise reduction function.
4690  // This loads the elements from the global and private variables and reduces
4691  // them before storing back the result to the global variable.
4692  Builder.SetInsertPoint(NonAtomicRedBlock);
4693  for (auto En : enumerate(ReductionInfos)) {
4694    const ReductionInfo &RI = En.value();
4696    // We have one less load for by-ref case because that load is now inside of
4697    // the reduction region
4698    Value *RedValue = RI.Variable;
4699    if (!IsByRef[En.index()]) {
4700      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4701                                    "red.value." + Twine(En.index()));
4702    }
4703    Value *PrivateRedValue =
4704        Builder.CreateLoad(ValueType, RI.PrivateVariable,
4705                           "red.private.value." + Twine(En.index()));
4706    Value *Reduced;
4707    InsertPointOrErrorTy AfterIP =
4708        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4709    if (!AfterIP)
4710      return AfterIP.takeError();
4711    Builder.restoreIP(*AfterIP);

4713    if (!Builder.GetInsertBlock())
4714      return InsertPointTy();
4715    // for by-ref case, the load is inside of the reduction region
4716    if (!IsByRef[En.index()])
4717      Builder.CreateStore(Reduced, RI.Variable);
4718  }
4719  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4720      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4721               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4722  createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4723  Builder.CreateBr(ContinuationBlock);

4725  // Populate the atomic reduction using the atomic elementwise reduction
4726  // function. There are no loads/stores here because they will be happening
4727  // inside the atomic elementwise reduction.
4728  Builder.SetInsertPoint(AtomicRedBlock);
4729  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4730    for (const ReductionInfo &RI : ReductionInfos) {
4732          Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4733      if (!AfterIP)
4734        return AfterIP.takeError();
4735      Builder.restoreIP(*AfterIP);
4736      if (!Builder.GetInsertBlock())
4737        return InsertPointTy();
4738    }
4739    Builder.CreateBr(ContinuationBlock);
4740  } else {
    // The runtime should never pick the atomic case when we did not advertise
    // it, so this block is unreachable.
4741    Builder.CreateUnreachable();
4742  }

4744  // Populate the outlined reduction function using the elementwise reduction
4745  // function. Partial values are extracted from the type-erased array of
4746  // pointers to private variables.
4747  Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4748                                        IsByRef, /*isGPU=*/false);
4749  if (Err)
4750    return Err;

4752  if (!Builder.GetInsertBlock())
4753    return InsertPointTy();

4755  Builder.SetInsertPoint(ContinuationBlock);
4756  return Builder.saveIP();
4757}
4758
// Emit an OpenMP `master` construct: the body (generated by BodyGenCB) is
// guarded by __kmpc_master / __kmpc_end_master so that only the master
// thread executes it; FiniCB runs finalization at region exit.
4761                                                   BodyGenCallbackTy BodyGenCB,
4762                                                   FinalizeCallbackTy FiniCB) {
4763  if (!updateToLocation(Loc))
4764    return Loc.IP;

4766  Directive OMPD = Directive::OMPD_master;
4767  uint32_t SrcLocStrSize;
4768  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4769  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4770  Value *ThreadId = getOrCreateThreadID(Ident);
4771  Value *Args[] = {Ident, ThreadId};

4773  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4774  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

4776  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4777  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);

  // Conditional: the body only runs when __kmpc_master returns non-zero.
4779  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4780                              /*Conditional*/ true, /*hasFinalize*/ true);
4781}
4782
// Emit an OpenMP `masked` construct: like `master`, but the executing thread
// is selected by the Filter argument passed to __kmpc_masked.
4785                                                   BodyGenCallbackTy BodyGenCB,
4786                                                   FinalizeCallbackTy FiniCB, Value *Filter) {
4787  if (!updateToLocation(Loc))
4788    return Loc.IP;

4790  Directive OMPD = Directive::OMPD_masked;
4791  uint32_t SrcLocStrSize;
4792  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4793  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4794  Value *ThreadId = getOrCreateThreadID(Ident);
  // __kmpc_masked takes the filter; __kmpc_end_masked does not.
4795  Value *Args[] = {Ident, ThreadId, Filter};
4796  Value *ArgsEnd[] = {Ident, ThreadId};

4798  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4799  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

4801  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4802  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);

4804  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4805                              /*Conditional*/ true, /*hasFinalize*/ true);
4806}
4807
// Emit a call to Callee and mark it nounwind, since the OpenMP runtime
// entry points do not throw.
4809                                             llvm::FunctionCallee Callee,
4811                                             const llvm::Twine &Name) {
4812  llvm::CallInst *Call = Builder.CreateCall(
4813      Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4814  Call->setDoesNotThrow();
4815  return Call;
4816}
4817
4818// Expects the input basic block to be dominated by BeforeScanBB.
4819// Once the scan directive is encountered, the code after it should be
4820// dominated by AfterScanBB. The scan directive splits the code sequence into
4821// an input phase and a scan phase. Based on whether the inclusive or
4822// exclusive clause is used on the directive, and whether the input loop or
4823// the scan loop is being lowered, it adds jumps to the input and scan
4824// phases. The first scan loop is the input loop and the second is the scan
4825// loop. The generated code currently handles only inclusive scans.
4827    const LocationDescription &Loc, InsertPointTy AllocaIP,
4828    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4829    bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) loop, allocate the scan buffers up front.
4830  if (ScanRedInfo->OMPFirstScanLoop) {
4831    llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4832                                                    ScanVarsType, ScanRedInfo);
4833    if (Err)
4834      return Err;
4835  }
4836  if (!updateToLocation(Loc))
4837    return Loc.IP;

4839  llvm::Value *IV = ScanRedInfo->IV;

4841  if (ScanRedInfo->OMPFirstScanLoop) {
4842    // Emit buffer[i] = red; at the end of the input phase.
4843    for (size_t i = 0; i < ScanVars.size(); i++) {
4844      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4845      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4846      Type *DestTy = ScanVarsType[i];
4847      Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4848      Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);

4850      Builder.CreateStore(Src, Val);
4851    }
4852  }
4853  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4854  emitBlock(ScanRedInfo->OMPScanDispatch,
4855            Builder.GetInsertBlock()->getParent());

4857  if (!ScanRedInfo->OMPFirstScanLoop) {
4858    IV = ScanRedInfo->IV;
4859    // Emit red = buffer[i]; at the entrance to the scan phase.
4860    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4861    for (size_t i = 0; i < ScanVars.size(); i++) {
4862      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4863      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4864      Type *DestTy = ScanVarsType[i];
4865      Value *SrcPtr =
4866          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4867      Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4868      Builder.CreateStore(Src, ScanVars[i]);
4869    }
4870  }

4872  // TODO: Update it to CreateBr and remove dead blocks
4873  llvm::Value *CmpI = Builder.getInt1(true);
4874  if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4875    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4876                         ScanRedInfo->OMPAfterScanBlock);
4877  } else {
4878    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4879                         ScanRedInfo->OMPBeforeScanBlock);
4880  }
4881  emitBlock(ScanRedInfo->OMPAfterScanBlock,
4882            Builder.GetInsertBlock()->getParent());
4883  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4884  return Builder.saveIP();
4885}
4886
// Allocate the temporary scan buffers: one pointer slot per scan variable at
// the alloca point, then (inside a masked region executed by thread 0 and
// followed by a barrier) malloc a buffer of Span+1 elements per variable and
// record it in ScanRedInfo->ScanBuffPtrs.
4887Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4888    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4889    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {

4891  Builder.restoreIP(AllocaIP);
4892  // Create the shared pointer at alloca IP.
4893  for (size_t i = 0; i < ScanVars.size(); i++) {
4894    llvm::Value *BuffPtr =
4895        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4896    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4897  }

4899  // Allocate temporary buffer by master thread
4900  auto BodyGenCB = [&](InsertPointTy AllocaIP,
4901                       InsertPointTy CodeGenIP) -> Error {
4902    Builder.restoreIP(CodeGenIP);
    // One extra element beyond the loop span.
4903    Value *AllocSpan =
4904        Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4905    for (size_t i = 0; i < ScanVars.size(); i++) {
4906      Type *IntPtrTy = Builder.getInt32Ty();
4907      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4908      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4909      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4910                                         AllocSpan, nullptr, "arr");
4911      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4912    }
4913    return Error::success();
4914  };
4915  // TODO: Perform finalization actions for variables. This has to be
4916  // called for variables which have destructors/finalizers.
4917  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

4919  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
  // Filter 0: only thread 0 performs the allocation.
4920  llvm::Value *FilterVal = Builder.getInt32(0);
4922      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

4924  if (!AfterIP)
4925    return AfterIP.takeError();
4926  Builder.restoreIP(*AfterIP);
4927  BasicBlock *InputBB = Builder.GetInsertBlock();
4928  if (InputBB->getTerminator())
4929    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier so no thread touches the buffers before they are allocated.
4930  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4931  if (!AfterIP)
4932    return AfterIP.takeError();
4933  Builder.restoreIP(*AfterIP);

4935  return Error::success();
4936}
4937
// Finalize a scan-based directive: inside a masked region (thread 0,
// followed by a barrier), copy the last buffer element of each reduction
// back to the original variable and free the malloc'd scan buffer.
4938Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4939    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4940  auto BodyGenCB = [&](InsertPointTy AllocaIP,
4941                       InsertPointTy CodeGenIP) -> Error {
4942    Builder.restoreIP(CodeGenIP);
4943    for (ReductionInfo RedInfo : ReductionInfos) {
4944      Value *PrivateVar = RedInfo.PrivateVariable;
4945      Value *OrigVar = RedInfo.Variable;
4946      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4947      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);

4949      Type *SrcTy = RedInfo.ElementType;
      // buffer[Span] holds the final reduced value.
4950      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4951                                             "arrayOffset");
4952      Value *Src = Builder.CreateLoad(SrcTy, Val);

4954      Builder.CreateStore(Src, OrigVar);
4955      Builder.CreateFree(Buff);
4956    }
4957    return Error::success();
4958  };
4959  // TODO: Perform finalization actions for variables. This has to be
4960  // called for variables which have destructors/finalizers.
4961  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

4963  if (ScanRedInfo->OMPScanFinish->getTerminator())
4964    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4965  else
4966    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);

  // Filter 0: only thread 0 performs the copy-back and free.
4968  llvm::Value *FilterVal = Builder.getInt32(0);
4970      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

4972  if (!AfterIP)
4973    return AfterIP.takeError();
4974  Builder.restoreIP(*AfterIP);
4975  BasicBlock *InputBB = Builder.GetInsertBlock();
4976  if (InputBB->getTerminator())
4977    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier so the buffers are not freed while other threads still read them.
4978  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4979  if (!AfterIP)
4980    return AfterIP.takeError();
4981  Builder.restoreIP(*AfterIP);
4982  return Error::success();
4983}
4984
4986 const LocationDescription &Loc,
4988 ScanInfo *ScanRedInfo) {
4989
4990 if (!updateToLocation(Loc))
4991 return Loc.IP;
4992 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4993 InsertPointTy CodeGenIP) -> Error {
4994 Builder.restoreIP(CodeGenIP);
4995 Function *CurFn = Builder.GetInsertBlock()->getParent();
4996 // for (int k = 0; k <= ceil(log2(n)); ++k)
4997 llvm::BasicBlock *LoopBB =
4998 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4999 llvm::BasicBlock *ExitBB =
5000 splitBB(Builder, false, "omp.outer.log.scan.exit");
5002 Builder.GetInsertBlock()->getModule(),
5003 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5004 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
5005 llvm::Value *Arg =
5006 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5007 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5009 Builder.GetInsertBlock()->getModule(),
5010 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5011 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5012 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5013 llvm::Value *NMin1 = Builder.CreateNUWSub(
5014 ScanRedInfo->Span,
5015 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5016 Builder.SetInsertPoint(InputBB);
5017 Builder.CreateBr(LoopBB);
5018 emitBlock(LoopBB, CurFn);
5019 Builder.SetInsertPoint(LoopBB);
5020
5021 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5022 // size pow2k = 1;
5023 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5024 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5025 InputBB);
5026 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5027 InputBB);
5028 // for (size i = n - 1; i >= 2 ^ k; --i)
5029 // tmp[i] op= tmp[i-pow2k];
5030 llvm::BasicBlock *InnerLoopBB =
5031 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5032 llvm::BasicBlock *InnerExitBB =
5033 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5034 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5035 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5036 emitBlock(InnerLoopBB, CurFn);
5037 Builder.SetInsertPoint(InnerLoopBB);
5038 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5039 IVal->addIncoming(NMin1, LoopBB);
5040 for (ReductionInfo RedInfo : ReductionInfos) {
5041 Value *ReductionVal = RedInfo.PrivateVariable;
5042 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5043 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5044 Type *DestTy = RedInfo.ElementType;
5045 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5046 Value *LHSPtr =
5047 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5048 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5049 Value *RHSPtr =
5050 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5051 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5052 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5053 llvm::Value *Result;
5054 InsertPointOrErrorTy AfterIP =
5055 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5056 if (!AfterIP)
5057 return AfterIP.takeError();
5058 Builder.CreateStore(Result, LHSPtr);
5059 }
5060 llvm::Value *NextIVal = Builder.CreateNUWSub(
5061 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5062 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5063 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5064 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5065 emitBlock(InnerExitBB, CurFn);
5066 llvm::Value *Next = Builder.CreateNUWAdd(
5067 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5068 Counter->addIncoming(Next, Builder.GetInsertBlock());
5069 // pow2k <<= 1;
5070 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5071 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5072 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5073 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5074 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5075 return Error::success();
5076 };
5077
5078 // TODO: Perform finalization actions for variables. This has to be
5079 // called for variables which have destructors/finalizers.
5080 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5081
5082 llvm::Value *FilterVal = Builder.getInt32(0);
5084 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5085
5086 if (!AfterIP)
5087 return AfterIP.takeError();
5088 Builder.restoreIP(*AfterIP);
5089 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5090
5091 if (!AfterIP)
5092 return AfterIP.takeError();
5093 Builder.restoreIP(*AfterIP);
5094 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5095 if (Err)
5096 return Err;
5097
5098 return AfterIP;
5099}
5100
5101Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5102 llvm::function_ref<Error()> InputLoopGen,
5103 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5104 ScanInfo *ScanRedInfo) {
5105
5106 {
5107 // Emit loop with input phase:
5108 // for (i: 0..<num_iters>) {
5109 // <input phase>;
5110 // buffer[i] = red;
5111 // }
5112 ScanRedInfo->OMPFirstScanLoop = true;
5113 Error Err = InputLoopGen();
5114 if (Err)
5115 return Err;
5116 }
5117 {
5118 // Emit loop with scan phase:
5119 // for (i: 0..<num_iters>) {
5120 // red = buffer[i];
5121 // <scan phase>;
5122 // }
5123 ScanRedInfo->OMPFirstScanLoop = false;
5124 Error Err = ScanLoopGen(Builder.saveIP());
5125 if (Err)
5126 return Err;
5127 }
5128 return Error::success();
5129}
5130
5131void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5132 Function *Fun = Builder.GetInsertBlock()->getParent();
5133 ScanRedInfo->OMPScanDispatch =
5134 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5135 ScanRedInfo->OMPAfterScanBlock =
5136 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5137 ScanRedInfo->OMPBeforeScanBlock =
5138 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5139 ScanRedInfo->OMPScanLoopExit =
5140 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5141}
5143 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5144 BasicBlock *PostInsertBefore, const Twine &Name) {
5145 Module *M = F->getParent();
5146 LLVMContext &Ctx = M->getContext();
5147 Type *IndVarTy = TripCount->getType();
5148
5149 // Create the basic block structure.
5150 BasicBlock *Preheader =
5151 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5152 BasicBlock *Header =
5153 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5154 BasicBlock *Cond =
5155 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5156 BasicBlock *Body =
5157 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5158 BasicBlock *Latch =
5159 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5160 BasicBlock *Exit =
5161 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5162 BasicBlock *After =
5163 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5164
5165 // Use specified DebugLoc for new instructions.
5166 Builder.SetCurrentDebugLocation(DL);
5167
5168 Builder.SetInsertPoint(Preheader);
5169 Builder.CreateBr(Header);
5170
5171 Builder.SetInsertPoint(Header);
5172 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5173 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5174 Builder.CreateBr(Cond);
5175
5176 Builder.SetInsertPoint(Cond);
5177 Value *Cmp =
5178 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5179 Builder.CreateCondBr(Cmp, Body, Exit);
5180
5181 Builder.SetInsertPoint(Body);
5182 Builder.CreateBr(Latch);
5183
5184 Builder.SetInsertPoint(Latch);
5185 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5186 "omp_" + Name + ".next", /*HasNUW=*/true);
5187 Builder.CreateBr(Header);
5188 IndVarPHI->addIncoming(Next, Latch);
5189
5190 Builder.SetInsertPoint(Exit);
5191 Builder.CreateBr(After);
5192
5193 // Remember and return the canonical control flow.
5194 LoopInfos.emplace_front();
5195 CanonicalLoopInfo *CL = &LoopInfos.front();
5196
5197 CL->Header = Header;
5198 CL->Cond = Cond;
5199 CL->Latch = Latch;
5200 CL->Exit = Exit;
5201
5202#ifndef NDEBUG
5203 CL->assertOK();
5204#endif
5205 return CL;
5206}
5207
5210 LoopBodyGenCallbackTy BodyGenCB,
5211 Value *TripCount, const Twine &Name) {
5212 BasicBlock *BB = Loc.IP.getBlock();
5213 BasicBlock *NextBB = BB->getNextNode();
5214
5215 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5216 NextBB, NextBB, Name);
5217 BasicBlock *After = CL->getAfter();
5218
5219 // If location is not set, don't connect the loop.
5220 if (updateToLocation(Loc)) {
5221 // Split the loop at the insertion point: Branch to the preheader and move
5222 // every following instruction to after the loop (the After BB). Also, the
5223 // new successor is the loop's after block.
5224 spliceBB(Builder, After, /*CreateBranch=*/false);
5225 Builder.CreateBr(CL->getPreheader());
5226 }
5227
5228 // Emit the body content. We do it after connecting the loop to the CFG to
5229 // avoid that the callback encounters degenerate BBs.
5230 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5231 return Err;
5232
5233#ifndef NDEBUG
5234 CL->assertOK();
5235#endif
5236 return CL;
5237}
5238
5240 ScanInfos.emplace_front();
5241 ScanInfo *Result = &ScanInfos.front();
5242 return Result;
5243}
5244
5248 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5249 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5250 LocationDescription ComputeLoc =
5251 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5252 updateToLocation(ComputeLoc);
5253
5255
5257 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5258 ScanRedInfo->Span = TripCount;
5259 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5260 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5261
5262 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5263 Builder.restoreIP(CodeGenIP);
5264 ScanRedInfo->IV = IV;
5265 createScanBBs(ScanRedInfo);
5266 BasicBlock *InputBlock = Builder.GetInsertBlock();
5267 Instruction *Terminator = InputBlock->getTerminator();
5268 assert(Terminator->getNumSuccessors() == 1);
5269 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5270 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5271 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5272 Builder.GetInsertBlock()->getParent());
5273 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5274 emitBlock(ScanRedInfo->OMPScanLoopExit,
5275 Builder.GetInsertBlock()->getParent());
5276 Builder.CreateBr(ContinueBlock);
5277 Builder.SetInsertPoint(
5278 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5279 return BodyGenCB(Builder.saveIP(), IV);
5280 };
5281
5282 const auto &&InputLoopGen = [&]() -> Error {
5284 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5285 ComputeIP, Name, true, ScanRedInfo);
5286 if (!LoopInfo)
5287 return LoopInfo.takeError();
5288 Result.push_back(*LoopInfo);
5289 Builder.restoreIP((*LoopInfo)->getAfterIP());
5290 return Error::success();
5291 };
5292 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5294 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5295 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5296 if (!LoopInfo)
5297 return LoopInfo.takeError();
5298 Result.push_back(*LoopInfo);
5299 Builder.restoreIP((*LoopInfo)->getAfterIP());
5300 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5301 return Error::success();
5302 };
5303 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5304 if (Err)
5305 return Err;
5306 return Result;
5307}
5308
5310 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5311 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5312
5313 // Consider the following difficulties (assuming 8-bit signed integers):
5314 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5315 // DO I = 1, 100, 50
5316 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
5317 // DO I = 100, 0, -128
5318
5319 // Start, Stop and Step must be of the same integer type.
5320 auto *IndVarTy = cast<IntegerType>(Start->getType());
5321 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5322 assert(IndVarTy == Step->getType() && "Step type mismatch");
5323
5325
5326 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5327 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5328
5329 // Like Step, but always positive.
5330 Value *Incr = Step;
5331
5332 // Distance between Start and Stop; always positive.
5333 Value *Span;
5334
5335 // Condition whether there are no iterations are executed at all, e.g. because
5336 // UB < LB.
5337 Value *ZeroCmp;
5338
5339 if (IsSigned) {
5340 // Ensure that increment is positive. If not, negate and invert LB and UB.
5341 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5342 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5343 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5344 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5345 Span = Builder.CreateSub(UB, LB, "", false, true);
5346 ZeroCmp = Builder.CreateICmp(
5347 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5348 } else {
5349 Span = Builder.CreateSub(Stop, Start, "", true);
5350 ZeroCmp = Builder.CreateICmp(
5351 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5352 }
5353
5354 Value *CountIfLooping;
5355 if (InclusiveStop) {
5356 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5357 } else {
5358 // Avoid incrementing past stop since it could overflow.
5359 Value *CountIfTwo = Builder.CreateAdd(
5360 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5361 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5362 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5363 }
5364
5365 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5366 "omp_" + Name + ".tripcount");
5367}
5368
5371 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5372 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5373 ScanInfo *ScanRedInfo) {
5374 LocationDescription ComputeLoc =
5375 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5376
5378 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5379
5380 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5381 Builder.restoreIP(CodeGenIP);
5382 Value *Span = Builder.CreateMul(IV, Step);
5383 Value *IndVar = Builder.CreateAdd(Span, Start);
5384 if (InScan)
5385 ScanRedInfo->IV = IndVar;
5386 return BodyGenCB(Builder.saveIP(), IndVar);
5387 };
5388 LocationDescription LoopLoc =
5389 ComputeIP.isSet()
5390 ? Loc
5391 : LocationDescription(Builder.saveIP(),
5392 Builder.getCurrentDebugLocation());
5393 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5394}
5395
5396// Returns an LLVM function to call for initializing loop bounds using OpenMP
5397// static scheduling for composite `distribute parallel for` depending on
5398// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5399// integers as unsigned similarly to CanonicalLoopInfo.
5400static FunctionCallee
5402 OpenMPIRBuilder &OMPBuilder) {
5403 unsigned Bitwidth = Ty->getIntegerBitWidth();
5404 if (Bitwidth == 32)
5405 return OMPBuilder.getOrCreateRuntimeFunction(
5406 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5407 if (Bitwidth == 64)
5408 return OMPBuilder.getOrCreateRuntimeFunction(
5409 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5410 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5411}
5412
5413// Returns an LLVM function to call for initializing loop bounds using OpenMP
5414// static scheduling depending on `type`. Only i32 and i64 are supported by the
5415// runtime. Always interpret integers as unsigned similarly to
5416// CanonicalLoopInfo.
5418 OpenMPIRBuilder &OMPBuilder) {
5419 unsigned Bitwidth = Ty->getIntegerBitWidth();
5420 if (Bitwidth == 32)
5421 return OMPBuilder.getOrCreateRuntimeFunction(
5422 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5423 if (Bitwidth == 64)
5424 return OMPBuilder.getOrCreateRuntimeFunction(
5425 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5426 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5427}
5428
5429OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5430 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5431 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5432 OMPScheduleType DistScheduleSchedType) {
5433 assert(CLI->isValid() && "Requires a valid canonical loop");
5434 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5435 "Require dedicated allocate IP");
5436
5437 // Set up the source location value for OpenMP runtime.
5438 Builder.restoreIP(CLI->getPreheaderIP());
5439 Builder.SetCurrentDebugLocation(DL);
5440
5441 uint32_t SrcLocStrSize;
5442 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5443 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5444
5445 // Declare useful OpenMP runtime functions.
5446 Value *IV = CLI->getIndVar();
5447 Type *IVTy = IV->getType();
5448 FunctionCallee StaticInit =
5449 LoopType == WorksharingLoopType::DistributeForStaticLoop
5450 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5451 : getKmpcForStaticInitForType(IVTy, M, *this);
5452 FunctionCallee StaticFini =
5453 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5454
5455 // Allocate space for computed loop bounds as expected by the "init" function.
5456 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5457
5458 Type *I32Type = Type::getInt32Ty(M.getContext());
5459 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5460 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5461 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5462 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5463 CLI->setLastIter(PLastIter);
5464
5465 // At the end of the preheader, prepare for calling the "init" function by
5466 // storing the current loop bounds into the allocated space. A canonical loop
5467 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5468 // and produces an inclusive upper bound.
5469 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5470 Constant *Zero = ConstantInt::get(IVTy, 0);
5471 Constant *One = ConstantInt::get(IVTy, 1);
5472 Builder.CreateStore(Zero, PLowerBound);
5473 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5474 Builder.CreateStore(UpperBound, PUpperBound);
5475 Builder.CreateStore(One, PStride);
5476
5477 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5478
5479 OMPScheduleType SchedType =
5480 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5481 ? OMPScheduleType::OrderedDistribute
5483 Constant *SchedulingType =
5484 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5485
5486 // Call the "init" function and update the trip count of the loop with the
5487 // value it produced.
5488 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5489 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5490 this](Value *SchedulingType, auto &Builder) {
5491 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5492 PLowerBound, PUpperBound});
5493 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5494 Value *PDistUpperBound =
5495 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5496 Args.push_back(PDistUpperBound);
5497 }
5498 Args.append({PStride, One, Zero});
5499 createRuntimeFunctionCall(StaticInit, Args);
5500 };
5501 BuildInitCall(SchedulingType, Builder);
5502 if (HasDistSchedule &&
5503 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5504 Constant *DistScheduleSchedType = ConstantInt::get(
5505 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5506 // We want to emit a second init function call for the dist_schedule clause
5507 // to the Distribute construct. This should only be done however if a
5508 // Workshare Loop is nested within a Distribute Construct
5509 BuildInitCall(DistScheduleSchedType, Builder);
5510 }
5511 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5512 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5513 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5514 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5515 CLI->setTripCount(TripCount);
5516
5517 // Update all uses of the induction variable except the one in the condition
5518 // block that compares it with the actual upper bound, and the increment in
5519 // the latch block.
5520
5521 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5522 Builder.SetInsertPoint(CLI->getBody(),
5523 CLI->getBody()->getFirstInsertionPt());
5524 Builder.SetCurrentDebugLocation(DL);
5525 return Builder.CreateAdd(OldIV, LowerBound);
5526 });
5527
5528 // In the "exit" block, call the "fini" function.
5529 Builder.SetInsertPoint(CLI->getExit(),
5530 CLI->getExit()->getTerminator()->getIterator());
5531 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5532
5533 // Add the barrier if requested.
5534 if (NeedsBarrier) {
5535 InsertPointOrErrorTy BarrierIP =
5537 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5538 /* CheckCancelFlag */ false);
5539 if (!BarrierIP)
5540 return BarrierIP.takeError();
5541 }
5542
5543 InsertPointTy AfterIP = CLI->getAfterIP();
5544 CLI->invalidate();
5545
5546 return AfterIP;
5547}
5548
5549static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5550 LoopInfo &LI);
5551static void addLoopMetadata(CanonicalLoopInfo *Loop,
5552 ArrayRef<Metadata *> Properties);
5553
5555 LLVMContext &Ctx, Loop *Loop,
5557 SmallVector<Metadata *> &LoopMDList) {
5558 SmallSet<BasicBlock *, 8> Reachable;
5559
5560 // Get the basic blocks from the loop in which memref instructions
5561 // can be found.
5562 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5563 // preferably without running any passes.
5564 for (BasicBlock *Block : Loop->getBlocks()) {
5565 if (Block == CLI->getCond() || Block == CLI->getHeader())
5566 continue;
5567 Reachable.insert(Block);
5568 }
5569
5570 // Add access group metadata to memory-access instructions.
5571 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5572 for (BasicBlock *BB : Reachable)
5573 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5574 // TODO: If the loop has existing parallel access metadata, have
5575 // to combine two lists.
5576 LoopMDList.push_back(MDNode::get(
5577 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5578}
5579
5581OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5582 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5583 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5584 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5585 assert(CLI->isValid() && "Requires a valid canonical loop");
5586 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5587
5588 LLVMContext &Ctx = CLI->getFunction()->getContext();
5589 Value *IV = CLI->getIndVar();
5590 Value *OrigTripCount = CLI->getTripCount();
5591 Type *IVTy = IV->getType();
5592 assert(IVTy->getIntegerBitWidth() <= 64 &&
5593 "Max supported tripcount bitwidth is 64 bits");
5594 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5595 : Type::getInt64Ty(Ctx);
5596 Type *I32Type = Type::getInt32Ty(M.getContext());
5597 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5598 Constant *One = ConstantInt::get(InternalIVTy, 1);
5599
5600 Function *F = CLI->getFunction();
5601 // Blocks must have terminators.
5602 // FIXME: Don't run analyses on incomplete/invalid IR.
5604 for (BasicBlock &BB : *F)
5605 if (!BB.getTerminator())
5606 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5608 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5609 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5610 LoopAnalysis LIA;
5611 LoopInfo &&LI = LIA.run(*F, FAM);
5612 for (Instruction *I : UIs)
5613 I->eraseFromParent();
5614 Loop *L = LI.getLoopFor(CLI->getHeader());
5615 SmallVector<Metadata *> LoopMDList;
5616 if (ChunkSize || DistScheduleChunkSize)
5617 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5618 addLoopMetadata(CLI, LoopMDList);
5619
5620 // Declare useful OpenMP runtime functions.
5621 FunctionCallee StaticInit =
5622 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5623 FunctionCallee StaticFini =
5624 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5625
5626 // Allocate space for computed loop bounds as expected by the "init" function.
5627 Builder.restoreIP(AllocaIP);
5628 Builder.SetCurrentDebugLocation(DL);
5629 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5630 Value *PLowerBound =
5631 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5632 Value *PUpperBound =
5633 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5634 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5635 CLI->setLastIter(PLastIter);
5636
5637 // Set up the source location value for the OpenMP runtime.
5638 Builder.restoreIP(CLI->getPreheaderIP());
5639 Builder.SetCurrentDebugLocation(DL);
5640
5641 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5642 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5643 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5644 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5645 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5646 "distschedulechunksize");
5647 Value *CastedTripCount =
5648 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5649
5650 Constant *SchedulingType =
5651 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5652 Constant *DistSchedulingType =
5653 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5654 Builder.CreateStore(Zero, PLowerBound);
5655 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5656 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5657 Value *UpperBound =
5658 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5659 Builder.CreateStore(UpperBound, PUpperBound);
5660 Builder.CreateStore(One, PStride);
5661
5662 // Call the "init" function and update the trip count of the loop with the
5663 // value it produced.
5664 uint32_t SrcLocStrSize;
5665 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5666 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5667 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5668 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5669 PUpperBound, PStride, One,
5670 this](Value *SchedulingType, Value *ChunkSize,
5671 auto &Builder) {
5673 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5674 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5675 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5676 /*pstride=*/PStride, /*incr=*/One,
5677 /*chunk=*/ChunkSize});
5678 };
5679 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5680 if (DistScheduleSchedType != OMPScheduleType::None &&
5681 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5682 SchedType != OMPScheduleType::OrderedDistribute) {
5683 // We want to emit a second init function call for the dist_schedule clause
5684 // to the Distribute construct. This should only be done however if a
5685 // Workshare Loop is nested within a Distribute Construct
5686 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5687 }
5688
5689 // Load values written by the "init" function.
5690 Value *FirstChunkStart =
5691 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5692 Value *FirstChunkStop =
5693 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5694 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5695 Value *ChunkRange =
5696 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5697 Value *NextChunkStride =
5698 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5699
5700 // Create outer "dispatch" loop for enumerating the chunks.
5701 BasicBlock *DispatchEnter = splitBB(Builder, true);
5702 Value *DispatchCounter;
5703
5704 // It is safe to assume this didn't return an error because the callback
5705 // passed into createCanonicalLoop is the only possible error source, and it
5706 // always returns success.
5707 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5708 {Builder.saveIP(), DL},
5709 [&](InsertPointTy BodyIP, Value *Counter) {
5710 DispatchCounter = Counter;
5711 return Error::success();
5712 },
5713 FirstChunkStart, CastedTripCount, NextChunkStride,
5714 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5715 "dispatch"));
5716
5717 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5718 // not have to preserve the canonical invariant.
5719 BasicBlock *DispatchBody = DispatchCLI->getBody();
5720 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5721 BasicBlock *DispatchExit = DispatchCLI->getExit();
5722 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5723 DispatchCLI->invalidate();
5724
5725 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5726 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5727 redirectTo(CLI->getExit(), DispatchLatch, DL);
5728 redirectTo(DispatchBody, DispatchEnter, DL);
5729
5730 // Prepare the prolog of the chunk loop.
5731 Builder.restoreIP(CLI->getPreheaderIP());
5732 Builder.SetCurrentDebugLocation(DL);
5733
5734 // Compute the number of iterations of the chunk loop.
5735 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5736 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5737 Value *IsLastChunk =
5738 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5739 Value *CountUntilOrigTripCount =
5740 Builder.CreateSub(CastedTripCount, DispatchCounter);
5741 Value *ChunkTripCount = Builder.CreateSelect(
5742 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5743 Value *BackcastedChunkTC =
5744 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5745 CLI->setTripCount(BackcastedChunkTC);
5746
5747 // Update all uses of the induction variable except the one in the condition
5748 // block that compares it with the actual upper bound, and the increment in
5749 // the latch block.
5750 Value *BackcastedDispatchCounter =
5751 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5752 CLI->mapIndVar([&](Instruction *) -> Value * {
5753 Builder.restoreIP(CLI->getBodyIP());
5754 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5755 });
5756
5757 // In the "exit" block, call the "fini" function.
5758 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5759 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5760
5761 // Add the barrier if requested.
5762 if (NeedsBarrier) {
5763 InsertPointOrErrorTy AfterIP =
5764 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5765 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5766 if (!AfterIP)
5767 return AfterIP.takeError();
5768 }
5769
5770#ifndef NDEBUG
5771 // Even though we currently do not support applying additional methods to it,
5772 // the chunk loop should remain a canonical loop.
5773 CLI->assertOK();
5774#endif
5775
5776 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5777}
5778
5779// Returns an LLVM function to call for executing an OpenMP static worksharing
5780// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5781// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5782static FunctionCallee
5784 WorksharingLoopType LoopType) {
5785 unsigned Bitwidth = Ty->getIntegerBitWidth();
5786 Module &M = OMPBuilder->M;
5787 switch (LoopType) {
5788 case WorksharingLoopType::ForStaticLoop:
5789 if (Bitwidth == 32)
5790 return OMPBuilder->getOrCreateRuntimeFunction(
5791 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5792 if (Bitwidth == 64)
5793 return OMPBuilder->getOrCreateRuntimeFunction(
5794 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5795 break;
5796 case WorksharingLoopType::DistributeStaticLoop:
5797 if (Bitwidth == 32)
5798 return OMPBuilder->getOrCreateRuntimeFunction(
5799 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5800 if (Bitwidth == 64)
5801 return OMPBuilder->getOrCreateRuntimeFunction(
5802 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5803 break;
5804 case WorksharingLoopType::DistributeForStaticLoop:
5805 if (Bitwidth == 32)
5806 return OMPBuilder->getOrCreateRuntimeFunction(
5807 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5808 if (Bitwidth == 64)
5809 return OMPBuilder->getOrCreateRuntimeFunction(
5810 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5811 break;
5812 }
5813 if (Bitwidth != 32 && Bitwidth != 64) {
5814 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5815 }
5816 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5817}
5818
5819// Inserts a call to proper OpenMP Device RTL function which handles
5820// loop worksharing.
5822 WorksharingLoopType LoopType,
5823 BasicBlock *InsertBlock, Value *Ident,
5824 Value *LoopBodyArg, Value *TripCount,
5825 Function &LoopBodyFn, bool NoLoop) {
5826 Type *TripCountTy = TripCount->getType();
5827 Module &M = OMPBuilder->M;
5828 IRBuilder<> &Builder = OMPBuilder->Builder;
5829 FunctionCallee RTLFn =
5830 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5831 SmallVector<Value *, 8> RealArgs;
5832 RealArgs.push_back(Ident);
5833 RealArgs.push_back(&LoopBodyFn);
5834 RealArgs.push_back(LoopBodyArg);
5835 RealArgs.push_back(TripCount);
5836 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5837 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5838 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5839 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5840 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5841 return;
5842 }
5843 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5844 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5845 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5846 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5847
5848 RealArgs.push_back(
5849 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5850 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5851 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5852 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5853 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5854 } else {
5855 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5856 }
5857
5858 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5859}
5860
5862 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5863 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5864 WorksharingLoopType LoopType, bool NoLoop) {
5865 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5866 BasicBlock *Preheader = CLI->getPreheader();
5867 Value *TripCount = CLI->getTripCount();
5868
5869 // After loop body outling, the loop body contains only set up
5870 // of loop body argument structure and the call to the outlined
5871 // loop body function. Firstly, we need to move setup of loop body args
5872 // into loop preheader.
5873 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5874 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5875
5876 // The next step is to remove the whole loop. We do not it need anymore.
5877 // That's why make an unconditional branch from loop preheader to loop
5878 // exit block
5879 Builder.restoreIP({Preheader, Preheader->end()});
5880 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5881 Preheader->getTerminator()->eraseFromParent();
5882 Builder.CreateBr(CLI->getExit());
5883
5884 // Delete dead loop blocks
5885 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5886 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5887 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5888 CleanUpInfo.EntryBB = CLI->getHeader();
5889 CleanUpInfo.ExitBB = CLI->getExit();
5890 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5891 DeleteDeadBlocks(BlocksToBeRemoved);
5892
5893 // Find the instruction which corresponds to loop body argument structure
5894 // and remove the call to loop body function instruction.
5895 Value *LoopBodyArg;
5896 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5897 assert(OutlinedFnUser &&
5898 "Expected unique undroppable user of outlined function");
5899 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5900 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5901 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5902 "Expected outlined function call to be located in loop preheader");
5903 // Check in case no argument structure has been passed.
5904 if (OutlinedFnCallInstruction->arg_size() > 1)
5905 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5906 else
5907 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5908 OutlinedFnCallInstruction->eraseFromParent();
5909
5910 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5911 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5912
5913 for (auto &ToBeDeletedItem : ToBeDeleted)
5914 ToBeDeletedItem->eraseFromParent();
5915 CLI->invalidate();
5916}
5917
5918OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5919 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5920 WorksharingLoopType LoopType, bool NoLoop) {
5921 uint32_t SrcLocStrSize;
5922 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5923 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5924
5925 OutlineInfo OI;
5926 OI.OuterAllocaBB = CLI->getPreheader();
5927 Function *OuterFn = CLI->getPreheader()->getParent();
5928
5929 // Instructions which need to be deleted at the end of code generation
5930 SmallVector<Instruction *, 4> ToBeDeleted;
5931
5932 OI.OuterAllocaBB = AllocaIP.getBlock();
5933
5934 // Mark the body loop as region which needs to be extracted
5935 OI.EntryBB = CLI->getBody();
5936 OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
5937 "omp.prelatch");
5938
5939 // Prepare loop body for extraction
5940 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5941
5942 // Insert new loop counter variable which will be used only in loop
5943 // body.
5944 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5945 Instruction *NewLoopCntLoad =
5946 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5947 // New loop counter instructions are redundant in the loop preheader when
5948 // code generation for workshare loop is finshed. That's why mark them as
5949 // ready for deletion.
5950 ToBeDeleted.push_back(NewLoopCntLoad);
5951 ToBeDeleted.push_back(NewLoopCnt);
5952
5953 // Analyse loop body region. Find all input variables which are used inside
5954 // loop body region.
5955 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5957 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5958
5959 CodeExtractorAnalysisCache CEAC(*OuterFn);
5960 CodeExtractor Extractor(Blocks,
5961 /* DominatorTree */ nullptr,
5962 /* AggregateArgs */ true,
5963 /* BlockFrequencyInfo */ nullptr,
5964 /* BranchProbabilityInfo */ nullptr,
5965 /* AssumptionCache */ nullptr,
5966 /* AllowVarArgs */ true,
5967 /* AllowAlloca */ true,
5968 /* AllocationBlock */ CLI->getPreheader(),
5969 /* Suffix */ ".omp_wsloop",
5970 /* AggrArgsIn0AddrSpace */ true);
5971
5972 BasicBlock *CommonExit = nullptr;
5973 SetVector<Value *> SinkingCands, HoistingCands;
5974
5975 // Find allocas outside the loop body region which are used inside loop
5976 // body
5977 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5978
5979 // We need to model loop body region as the function f(cnt, loop_arg).
5980 // That's why we replace loop induction variable by the new counter
5981 // which will be one of loop body function argument
5983 CLI->getIndVar()->user_end());
5984 for (auto Use : Users) {
5985 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5986 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5987 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5988 }
5989 }
5990 }
5991 // Make sure that loop counter variable is not merged into loop body
5992 // function argument structure and it is passed as separate variable
5993 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5994
5995 // PostOutline CB is invoked when loop body function is outlined and
5996 // loop body is replaced by call to outlined function. We need to add
5997 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5998 // function will handle loop control logic.
5999 //
6000 OI.PostOutlineCB = [=, ToBeDeletedVec =
6001 std::move(ToBeDeleted)](Function &OutlinedFn) {
6002 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6003 LoopType, NoLoop);
6004 };
6005 addOutlineInfo(std::move(OI));
6006 return CLI->getAfterIP();
6007}
6008
6011 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6012 bool HasSimdModifier, bool HasMonotonicModifier,
6013 bool HasNonmonotonicModifier, bool HasOrderedClause,
6014 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6015 Value *DistScheduleChunkSize) {
6016 if (Config.isTargetDevice())
6017 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6018 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6019 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6020 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6021
6022 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6023 OMPScheduleType::ModifierOrdered;
6024 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6025 if (HasDistSchedule) {
6026 DistScheduleSchedType = DistScheduleChunkSize
6027 ? OMPScheduleType::OrderedDistributeChunked
6028 : OMPScheduleType::OrderedDistribute;
6029 }
6030 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6031 case OMPScheduleType::BaseStatic:
6032 case OMPScheduleType::BaseDistribute:
6033 assert((!ChunkSize || !DistScheduleChunkSize) &&
6034 "No chunk size with static-chunked schedule");
6035 if (IsOrdered && !HasDistSchedule)
6036 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6037 NeedsBarrier, ChunkSize);
6038 // FIXME: Monotonicity ignored?
6039 if (DistScheduleChunkSize)
6040 return applyStaticChunkedWorkshareLoop(
6041 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6042 DistScheduleChunkSize, DistScheduleSchedType);
6043 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6044 HasDistSchedule);
6045
6046 case OMPScheduleType::BaseStaticChunked:
6047 case OMPScheduleType::BaseDistributeChunked:
6048 if (IsOrdered && !HasDistSchedule)
6049 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6050 NeedsBarrier, ChunkSize);
6051 // FIXME: Monotonicity ignored?
6052 return applyStaticChunkedWorkshareLoop(
6053 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6054 DistScheduleChunkSize, DistScheduleSchedType);
6055
6056 case OMPScheduleType::BaseRuntime:
6057 case OMPScheduleType::BaseAuto:
6058 case OMPScheduleType::BaseGreedy:
6059 case OMPScheduleType::BaseBalanced:
6060 case OMPScheduleType::BaseSteal:
6061 case OMPScheduleType::BaseRuntimeSimd:
6062 assert(!ChunkSize &&
6063 "schedule type does not support user-defined chunk sizes");
6064 [[fallthrough]];
6065 case OMPScheduleType::BaseGuidedSimd:
6066 case OMPScheduleType::BaseDynamicChunked:
6067 case OMPScheduleType::BaseGuidedChunked:
6068 case OMPScheduleType::BaseGuidedIterativeChunked:
6069 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6070 case OMPScheduleType::BaseStaticBalancedChunked:
6071 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6072 NeedsBarrier, ChunkSize);
6073
6074 default:
6075 llvm_unreachable("Unknown/unimplemented schedule kind");
6076 }
6077}
6078
6079/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6080/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6081/// the runtime. Always interpret integers as unsigned similarly to
6082/// CanonicalLoopInfo.
6083static FunctionCallee
6085 unsigned Bitwidth = Ty->getIntegerBitWidth();
6086 if (Bitwidth == 32)
6087 return OMPBuilder.getOrCreateRuntimeFunction(
6088 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6089 if (Bitwidth == 64)
6090 return OMPBuilder.getOrCreateRuntimeFunction(
6091 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6092 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6093}
6094
6095/// Returns an LLVM function to call for updating the next loop using OpenMP
6096/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6097/// the runtime. Always interpret integers as unsigned similarly to
6098/// CanonicalLoopInfo.
6099static FunctionCallee
6101 unsigned Bitwidth = Ty->getIntegerBitWidth();
6102 if (Bitwidth == 32)
6103 return OMPBuilder.getOrCreateRuntimeFunction(
6104 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6105 if (Bitwidth == 64)
6106 return OMPBuilder.getOrCreateRuntimeFunction(
6107 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6108 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6109}
6110
6111/// Returns an LLVM function to call for finalizing the dynamic loop using
6112/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6113/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6114static FunctionCallee
6116 unsigned Bitwidth = Ty->getIntegerBitWidth();
6117 if (Bitwidth == 32)
6118 return OMPBuilder.getOrCreateRuntimeFunction(
6119 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6120 if (Bitwidth == 64)
6121 return OMPBuilder.getOrCreateRuntimeFunction(
6122 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6123 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6124}
6125
6127OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6128 InsertPointTy AllocaIP,
6129 OMPScheduleType SchedType,
6130 bool NeedsBarrier, Value *Chunk) {
6131 assert(CLI->isValid() && "Requires a valid canonical loop");
6132 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6133 "Require dedicated allocate IP");
6135 "Require valid schedule type");
6136
6137 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6138 OMPScheduleType::ModifierOrdered;
6139
6140 // Set up the source location value for OpenMP runtime.
6141 Builder.SetCurrentDebugLocation(DL);
6142
6143 uint32_t SrcLocStrSize;
6144 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6145 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6146
6147 // Declare useful OpenMP runtime functions.
6148 Value *IV = CLI->getIndVar();
6149 Type *IVTy = IV->getType();
6150 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6151 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6152
6153 // Allocate space for computed loop bounds as expected by the "init" function.
6154 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6155 Type *I32Type = Type::getInt32Ty(M.getContext());
6156 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6157 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6158 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6159 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6160 CLI->setLastIter(PLastIter);
6161
6162 // At the end of the preheader, prepare for calling the "init" function by
6163 // storing the current loop bounds into the allocated space. A canonical loop
6164 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6165 // and produces an inclusive upper bound.
6166 BasicBlock *PreHeader = CLI->getPreheader();
6167 Builder.SetInsertPoint(PreHeader->getTerminator());
6168 Constant *One = ConstantInt::get(IVTy, 1);
6169 Builder.CreateStore(One, PLowerBound);
6170 Value *UpperBound = CLI->getTripCount();
6171 Builder.CreateStore(UpperBound, PUpperBound);
6172 Builder.CreateStore(One, PStride);
6173
6174 BasicBlock *Header = CLI->getHeader();
6175 BasicBlock *Exit = CLI->getExit();
6176 BasicBlock *Cond = CLI->getCond();
6177 BasicBlock *Latch = CLI->getLatch();
6178 InsertPointTy AfterIP = CLI->getAfterIP();
6179
6180 // The CLI will be "broken" in the code below, as the loop is no longer
6181 // a valid canonical loop.
6182
6183 if (!Chunk)
6184 Chunk = One;
6185
6186 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6187
6188 Constant *SchedulingType =
6189 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6190
6191 // Call the "init" function.
6192 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6193 /* LowerBound */ One, UpperBound,
6194 /* step */ One, Chunk});
6195
6196 // An outer loop around the existing one.
6197 BasicBlock *OuterCond = BasicBlock::Create(
6198 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6199 PreHeader->getParent());
6200 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6201 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6203 DynamicNext,
6204 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6205 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6206 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6207 Value *LowerBound =
6208 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6209 Builder.CreateCondBr(MoreWork, Header, Exit);
6210
6211 // Change PHI-node in loop header to use outer cond rather than preheader,
6212 // and set IV to the LowerBound.
6213 Instruction *Phi = &Header->front();
6214 auto *PI = cast<PHINode>(Phi);
6215 PI->setIncomingBlock(0, OuterCond);
6216 PI->setIncomingValue(0, LowerBound);
6217
6218 // Then set the pre-header to jump to the OuterCond
6219 Instruction *Term = PreHeader->getTerminator();
6220 auto *Br = cast<UncondBrInst>(Term);
6221 Br->setSuccessor(OuterCond);
6222
6223 // Modify the inner condition:
6224 // * Use the UpperBound returned from the DynamicNext call.
6225 // * jump to the loop outer loop when done with one of the inner loops.
6226 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6227 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6228 Instruction *Comp = &*Builder.GetInsertPoint();
6229 auto *CI = cast<CmpInst>(Comp);
6230 CI->setOperand(1, UpperBound);
6231 // Redirect the inner exit to branch to outer condition.
6232 Instruction *Branch = &Cond->back();
6233 auto *BI = cast<CondBrInst>(Branch);
6234 assert(BI->getSuccessor(1) == Exit);
6235 BI->setSuccessor(1, OuterCond);
6236
6237 // Call the "fini" function if "ordered" is present in wsloop directive.
6238 if (Ordered) {
6239 Builder.SetInsertPoint(&Latch->back());
6240 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6241 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6242 }
6243
6244 // Add the barrier if requested.
6245 if (NeedsBarrier) {
6246 Builder.SetInsertPoint(&Exit->back());
6247 InsertPointOrErrorTy BarrierIP =
6249 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6250 /* CheckCancelFlag */ false);
6251 if (!BarrierIP)
6252 return BarrierIP.takeError();
6253 }
6254
6255 CLI->invalidate();
6256 return AfterIP;
6257}
6258
6259/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6260/// after this \p OldTarget will be orphaned.
6262 BasicBlock *NewTarget, DebugLoc DL) {
6263 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6264 redirectTo(Pred, NewTarget, DL);
6265}
6266
6267/// Determine which blocks in \p BBs are reachable from outside and remove the
6268/// ones that are not reachable from the function.
6271 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6272 for (Use &U : BB->uses()) {
6273 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6274 if (!UseInst)
6275 continue;
6276 if (BBsToErase.count(UseInst->getParent()))
6277 continue;
6278 return true;
6279 }
6280 return false;
6281 };
6282
6283 while (BBsToErase.remove_if(HasRemainingUses)) {
6284 // Try again if anything was removed.
6285 }
6286
6287 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6288 DeleteDeadBlocks(BBVec);
6289}
6290
6291CanonicalLoopInfo *
6293 InsertPointTy ComputeIP) {
6294 assert(Loops.size() >= 1 && "At least one loop required");
6295 size_t NumLoops = Loops.size();
6296
6297 // Nothing to do if there is already just one loop.
6298 if (NumLoops == 1)
6299 return Loops.front();
6300
6301 CanonicalLoopInfo *Outermost = Loops.front();
6302 CanonicalLoopInfo *Innermost = Loops.back();
6303 BasicBlock *OrigPreheader = Outermost->getPreheader();
6304 BasicBlock *OrigAfter = Outermost->getAfter();
6305 Function *F = OrigPreheader->getParent();
6306
6307 // Loop control blocks that may become orphaned later.
6308 SmallVector<BasicBlock *, 12> OldControlBBs;
6309 OldControlBBs.reserve(6 * Loops.size());
6311 Loop->collectControlBlocks(OldControlBBs);
6312
6313 // Setup the IRBuilder for inserting the trip count computation.
6314 Builder.SetCurrentDebugLocation(DL);
6315 if (ComputeIP.isSet())
6316 Builder.restoreIP(ComputeIP);
6317 else
6318 Builder.restoreIP(Outermost->getPreheaderIP());
6319
6320 // Derive the collapsed' loop trip count.
6321 // TODO: Find common/largest indvar type.
6322 Value *CollapsedTripCount = nullptr;
6323 for (CanonicalLoopInfo *L : Loops) {
6324 assert(L->isValid() &&
6325 "All loops to collapse must be valid canonical loops");
6326 Value *OrigTripCount = L->getTripCount();
6327 if (!CollapsedTripCount) {
6328 CollapsedTripCount = OrigTripCount;
6329 continue;
6330 }
6331
6332 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6333 CollapsedTripCount =
6334 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6335 }
6336
6337 // Create the collapsed loop control flow.
6338 CanonicalLoopInfo *Result =
6339 createLoopSkeleton(DL, CollapsedTripCount, F,
6340 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6341
6342 // Build the collapsed loop body code.
6343 // Start with deriving the input loop induction variables from the collapsed
6344 // one, using a divmod scheme. To preserve the original loops' order, the
6345 // innermost loop use the least significant bits.
6346 Builder.restoreIP(Result->getBodyIP());
6347
6348 Value *Leftover = Result->getIndVar();
6349 SmallVector<Value *> NewIndVars;
6350 NewIndVars.resize(NumLoops);
6351 for (int i = NumLoops - 1; i >= 1; --i) {
6352 Value *OrigTripCount = Loops[i]->getTripCount();
6353
6354 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6355 NewIndVars[i] = NewIndVar;
6356
6357 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6358 }
6359 // Outermost loop gets all the remaining bits.
6360 NewIndVars[0] = Leftover;
6361
6362 // Construct the loop body control flow.
6363 // We progressively construct the branch structure following in direction of
6364 // the control flow, from the leading in-between code, the loop nest body, the
6365 // trailing in-between code, and rejoining the collapsed loop's latch.
6366 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6367 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6368 // its predecessors as sources.
6369 BasicBlock *ContinueBlock = Result->getBody();
6370 BasicBlock *ContinuePred = nullptr;
6371 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6372 BasicBlock *NextSrc) {
6373 if (ContinueBlock)
6374 redirectTo(ContinueBlock, Dest, DL);
6375 else
6376 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6377
6378 ContinueBlock = nullptr;
6379 ContinuePred = NextSrc;
6380 };
6381
6382 // The code before the nested loop of each level.
6383 // Because we are sinking it into the nest, it will be executed more often
6384 // that the original loop. More sophisticated schemes could keep track of what
6385 // the in-between code is and instantiate it only once per thread.
6386 for (size_t i = 0; i < NumLoops - 1; ++i)
6387 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6388
6389 // Connect the loop nest body.
6390 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6391
6392 // The code after the nested loop at each level.
6393 for (size_t i = NumLoops - 1; i > 0; --i)
6394 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6395
6396 // Connect the finished loop to the collapsed loop latch.
6397 ContinueWith(Result->getLatch(), nullptr);
6398
6399 // Replace the input loops with the new collapsed loop.
6400 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6401 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6402
6403 // Replace the input loop indvars with the derived ones.
6404 for (size_t i = 0; i < NumLoops; ++i)
6405 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6406
6407 // Remove unused parts of the input loops.
6408 removeUnusedBlocksFromParent(OldControlBBs);
6409
6410 for (CanonicalLoopInfo *L : Loops)
6411 L->invalidate();
6412
6413#ifndef NDEBUG
6414 Result->assertOK();
6415#endif
6416 return Result;
6417}
6418
6419std::vector<CanonicalLoopInfo *>
6421 ArrayRef<Value *> TileSizes) {
6422 assert(TileSizes.size() == Loops.size() &&
6423 "Must pass as many tile sizes as there are loops");
6424 int NumLoops = Loops.size();
6425 assert(NumLoops >= 1 && "At least one loop to tile required");
6426
6427 CanonicalLoopInfo *OutermostLoop = Loops.front();
6428 CanonicalLoopInfo *InnermostLoop = Loops.back();
6429 Function *F = OutermostLoop->getBody()->getParent();
6430 BasicBlock *InnerEnter = InnermostLoop->getBody();
6431 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6432
6433 // Loop control blocks that may become orphaned later.
6434 SmallVector<BasicBlock *, 12> OldControlBBs;
6435 OldControlBBs.reserve(6 * Loops.size());
6437 Loop->collectControlBlocks(OldControlBBs);
6438
6439 // Collect original trip counts and induction variable to be accessible by
6440 // index. Also, the structure of the original loops is not preserved during
6441 // the construction of the tiled loops, so do it before we scavenge the BBs of
6442 // any original CanonicalLoopInfo.
6443 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6444 for (CanonicalLoopInfo *L : Loops) {
6445 assert(L->isValid() && "All input loops must be valid canonical loops");
6446 OrigTripCounts.push_back(L->getTripCount());
6447 OrigIndVars.push_back(L->getIndVar());
6448 }
6449
6450 // Collect the code between loop headers. These may contain SSA definitions
6451 // that are used in the loop nest body. To be usable with in the innermost
6452 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6453 // these instructions may be executed more often than before the tiling.
6454 // TODO: It would be sufficient to only sink them into body of the
6455 // corresponding tile loop.
6457 for (int i = 0; i < NumLoops - 1; ++i) {
6458 CanonicalLoopInfo *Surrounding = Loops[i];
6459 CanonicalLoopInfo *Nested = Loops[i + 1];
6460
6461 BasicBlock *EnterBB = Surrounding->getBody();
6462 BasicBlock *ExitBB = Nested->getHeader();
6463 InbetweenCode.emplace_back(EnterBB, ExitBB);
6464 }
6465
6466 // Compute the trip counts of the floor loops.
6467 Builder.SetCurrentDebugLocation(DL);
6468 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6469 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6470 for (int i = 0; i < NumLoops; ++i) {
6471 Value *TileSize = TileSizes[i];
6472 Value *OrigTripCount = OrigTripCounts[i];
6473 Type *IVType = OrigTripCount->getType();
6474
6475 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6476 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6477
6478 // 0 if tripcount divides the tilesize, 1 otherwise.
6479 // 1 means we need an additional iteration for a partial tile.
6480 //
6481 // Unfortunately we cannot just use the roundup-formula
6482 // (tripcount + tilesize - 1)/tilesize
6483 // because the summation might overflow. We do not want introduce undefined
6484 // behavior when the untiled loop nest did not.
6485 Value *FloorTripOverflow =
6486 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6487
6488 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6489 Value *FloorTripCount =
6490 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6491 "omp_floor" + Twine(i) + ".tripcount", true);
6492
6493 // Remember some values for later use.
6494 FloorCompleteCount.push_back(FloorCompleteTripCount);
6495 FloorCount.push_back(FloorTripCount);
6496 FloorRems.push_back(FloorTripRem);
6497 }
6498
6499 // Generate the new loop nest, from the outermost to the innermost.
6500 std::vector<CanonicalLoopInfo *> Result;
6501 Result.reserve(NumLoops * 2);
6502
6503 // The basic block of the surrounding loop that enters the nest generated
6504 // loop.
6505 BasicBlock *Enter = OutermostLoop->getPreheader();
6506
6507 // The basic block of the surrounding loop where the inner code should
6508 // continue.
6509 BasicBlock *Continue = OutermostLoop->getAfter();
6510
6511 // Where the next loop basic block should be inserted.
6512 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6513
6514 auto EmbeddNewLoop =
6515 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6516 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6517 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6518 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6519 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6520 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6521
6522 // Setup the position where the next embedded loop connects to this loop.
6523 Enter = EmbeddedLoop->getBody();
6524 Continue = EmbeddedLoop->getLatch();
6525 OutroInsertBefore = EmbeddedLoop->getLatch();
6526 return EmbeddedLoop;
6527 };
6528
6529 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6530 const Twine &NameBase) {
6531 for (auto P : enumerate(TripCounts)) {
6532 CanonicalLoopInfo *EmbeddedLoop =
6533 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6534 Result.push_back(EmbeddedLoop);
6535 }
6536 };
6537
6538 EmbeddNewLoops(FloorCount, "floor");
6539
6540 // Within the innermost floor loop, emit the code that computes the tile
6541 // sizes.
6542 Builder.SetInsertPoint(Enter->getTerminator());
6543 SmallVector<Value *, 4> TileCounts;
6544 for (int i = 0; i < NumLoops; ++i) {
6545 CanonicalLoopInfo *FloorLoop = Result[i];
6546 Value *TileSize = TileSizes[i];
6547
6548 Value *FloorIsEpilogue =
6549 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6550 Value *TileTripCount =
6551 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6552
6553 TileCounts.push_back(TileTripCount);
6554 }
6555
6556 // Create the tile loops.
6557 EmbeddNewLoops(TileCounts, "tile");
6558
6559 // Insert the inbetween code into the body.
6560 BasicBlock *BodyEnter = Enter;
6561 BasicBlock *BodyEntered = nullptr;
6562 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6563 BasicBlock *EnterBB = P.first;
6564 BasicBlock *ExitBB = P.second;
6565
6566 if (BodyEnter)
6567 redirectTo(BodyEnter, EnterBB, DL);
6568 else
6569 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6570
6571 BodyEnter = nullptr;
6572 BodyEntered = ExitBB;
6573 }
6574
6575 // Append the original loop nest body into the generated loop nest body.
6576 if (BodyEnter)
6577 redirectTo(BodyEnter, InnerEnter, DL);
6578 else
6579 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6581
6582 // Replace the original induction variable with an induction variable computed
6583 // from the tile and floor induction variables.
6584 Builder.restoreIP(Result.back()->getBodyIP());
6585 for (int i = 0; i < NumLoops; ++i) {
6586 CanonicalLoopInfo *FloorLoop = Result[i];
6587 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6588 Value *OrigIndVar = OrigIndVars[i];
6589 Value *Size = TileSizes[i];
6590
6591 Value *Scale =
6592 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6593 Value *Shift =
6594 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6595 OrigIndVar->replaceAllUsesWith(Shift);
6596 }
6597
6598 // Remove unused parts of the original loops.
6599 removeUnusedBlocksFromParent(OldControlBBs);
6600
6601 for (CanonicalLoopInfo *L : Loops)
6602 L->invalidate();
6603
6604#ifndef NDEBUG
6605 for (CanonicalLoopInfo *GenL : Result)
6606 GenL->assertOK();
6607#endif
6608 return Result;
6609}
6610
6611/// Attach metadata \p Properties to the basic block described by \p BB. If the
6612/// basic block already has metadata, the basic block properties are appended.
6614 ArrayRef<Metadata *> Properties) {
6615 // Nothing to do if no property to attach.
6616 if (Properties.empty())
6617 return;
6618
6619 LLVMContext &Ctx = BB->getContext();
6620 SmallVector<Metadata *> NewProperties;
6621 NewProperties.push_back(nullptr);
6622
6623 // If the basic block already has metadata, prepend it to the new metadata.
6624 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6625 if (Existing)
6626 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6627
6628 append_range(NewProperties, Properties);
6629 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6630 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6631
6632 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6633}
6634
6635/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6636/// loop already has metadata, the loop properties are appended.
6638 ArrayRef<Metadata *> Properties) {
6639 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6640
6641 // Attach metadata to the loop's latch
6642 BasicBlock *Latch = Loop->getLatch();
6643 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6644 addBasicBlockMetadata(Latch, Properties);
6645}
6646
6647/// Attach llvm.access.group metadata to the memref instructions of \p Block
6649 LoopInfo &LI) {
6650 for (Instruction &I : *Block) {
6651 if (I.mayReadOrWriteMemory()) {
6652 // TODO: This instruction may already have access group from
6653 // other pragmas e.g. #pragma clang loop vectorize. Append
6654 // so that the existing metadata is not overwritten.
6655 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6656 }
6657 }
6658}
6659
6660CanonicalLoopInfo *
6662 CanonicalLoopInfo *firstLoop = Loops.front();
6663 CanonicalLoopInfo *lastLoop = Loops.back();
6664 Function *F = firstLoop->getPreheader()->getParent();
6665
6666 // Loop control blocks that will become orphaned later
6667 SmallVector<BasicBlock *> oldControlBBs;
6669 Loop->collectControlBlocks(oldControlBBs);
6670
6671 // Collect original trip counts
6672 SmallVector<Value *> origTripCounts;
6673 for (CanonicalLoopInfo *L : Loops) {
6674 assert(L->isValid() && "All input loops must be valid canonical loops");
6675 origTripCounts.push_back(L->getTripCount());
6676 }
6677
6678 Builder.SetCurrentDebugLocation(DL);
6679
6680 // Compute max trip count.
6681 // The fused loop will be from 0 to max(origTripCounts)
6682 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6683 F, firstLoop->getHeader());
6684 Builder.SetInsertPoint(TCBlock);
6685 Value *fusedTripCount = nullptr;
6686 for (CanonicalLoopInfo *L : Loops) {
6687 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6688 Value *origTripCount = L->getTripCount();
6689 if (!fusedTripCount) {
6690 fusedTripCount = origTripCount;
6691 continue;
6692 }
6693 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6694 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6695 ".omp.fuse.tc");
6696 }
6697
6698 // Generate new loop
6699 CanonicalLoopInfo *fused =
6700 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6701 lastLoop->getLatch(), "fused");
6702
6703 // Replace original loops with the fused loop
6704 // Preheader and After are not considered inside the CLI.
6705 // These are used to compute the individual TCs of the loops
6706 // so they have to be put before the resulting fused loop.
6707 // Moving them up for readability.
6708 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6709 Loops[i]->getPreheader()->moveBefore(TCBlock);
6710 Loops[i]->getAfter()->moveBefore(TCBlock);
6711 }
6712 lastLoop->getPreheader()->moveBefore(TCBlock);
6713
6714 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6715 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6716 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6717 }
6718 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6719 redirectTo(TCBlock, fused->getPreheader(), DL);
6720 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6721
6722 // Build the fused body
6723 // Create new Blocks with conditions that jump to the original loop bodies
6725 SmallVector<Value *> condValues;
6726 for (size_t i = 0; i < Loops.size(); ++i) {
6727 BasicBlock *condBlock = BasicBlock::Create(
6728 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6729 Builder.SetInsertPoint(condBlock);
6730 Value *condValue =
6731 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6732 condBBs.push_back(condBlock);
6733 condValues.push_back(condValue);
6734 }
6735 // Join the condition blocks with the bodies of the original loops
6736 redirectTo(fused->getBody(), condBBs[0], DL);
6737 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6738 Builder.SetInsertPoint(condBBs[i]);
6739 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6740 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6741 // Replace the IV with the fused IV
6742 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6743 }
6744 // Last body jumps to the created end body block
6745 Builder.SetInsertPoint(condBBs.back());
6746 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6747 fused->getLatch());
6748 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6749 // Replace the IV with the fused IV
6750 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6751
6752 // The loop latch must have only one predecessor. Currently it is branched to
6753 // from both the last condition block and the last loop body
6754 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6755 "omp.fused.pre_latch");
6756
6757 // Remove unused parts
6758 removeUnusedBlocksFromParent(oldControlBBs);
6759
6760 // Invalidate old CLIs
6761 for (CanonicalLoopInfo *L : Loops)
6762 L->invalidate();
6763
6764#ifndef NDEBUG
6765 fused->assertOK();
6766#endif
6767 return fused;
6768}
6769
6771 LLVMContext &Ctx = Builder.getContext();
6773 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6774 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6775}
6776
6778 LLVMContext &Ctx = Builder.getContext();
6780 Loop, {
6781 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6782 });
6783}
6784
6785void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6786 Value *IfCond, ValueToValueMapTy &VMap,
6787 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6788 const Twine &NamePrefix) {
6789 Function *F = CanonicalLoop->getFunction();
6790
6791 // We can't do
6792 // if (cond) {
6793 // simd_loop;
6794 // } else {
6795 // non_simd_loop;
6796 // }
6797 // because then the CanonicalLoopInfo would only point to one of the loops:
6798 // leading to other constructs operating on the same loop to malfunction.
6799 // Instead generate
6800 // while (...) {
6801 // if (cond) {
6802 // simd_body;
6803 // } else {
6804 // not_simd_body;
6805 // }
6806 // }
6807 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6808 // body at -O3
6809
6810 // Define where if branch should be inserted
6811 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6812
6813 // Create additional blocks for the if statement
6814 BasicBlock *Cond = SplitBeforeIt->getParent();
6815 llvm::LLVMContext &C = Cond->getContext();
6817 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6819 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6820
6821 // Create if condition branch.
6822 Builder.SetInsertPoint(SplitBeforeIt);
6823 Instruction *BrInstr =
6824 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6825 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6826 // Then block contains branch to omp loop body which needs to be vectorized
6827 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6828 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6829
6830 Builder.SetInsertPoint(ElseBlock);
6831
6832 // Clone loop for the else branch
6834
6835 SmallVector<BasicBlock *, 8> ExistingBlocks;
6836 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6837 ExistingBlocks.push_back(ThenBlock);
6838 ExistingBlocks.append(L->block_begin(), L->block_end());
6839 // Cond is the block that has the if clause condition
6840 // LoopCond is omp_loop.cond
6841 // LoopHeader is omp_loop.header
6842 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6843 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6844 assert(LoopCond && LoopHeader && "Invalid loop structure");
6845 for (BasicBlock *Block : ExistingBlocks) {
6846 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6847 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6848 continue;
6849 }
6850 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6851
6852 // fix name not to be omp.if.then
6853 if (Block == ThenBlock)
6854 NewBB->setName(NamePrefix + ".if.else");
6855
6856 NewBB->moveBefore(CanonicalLoop->getExit());
6857 VMap[Block] = NewBB;
6858 NewBlocks.push_back(NewBB);
6859 }
6860 remapInstructionsInBlocks(NewBlocks, VMap);
6861 Builder.CreateBr(NewBlocks.front());
6862
6863 // The loop latch must have only one predecessor. Currently it is branched to
6864 // from both the 'then' and 'else' branches.
6865 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6866 NamePrefix + ".pre_latch");
6867
6868 // Ensure that the then block is added to the loop so we add the attributes in
6869 // the next step
6870 L->addBasicBlockToLoop(ThenBlock, LI);
6871}
6872
6873unsigned
6875 const StringMap<bool> &Features) {
6876 if (TargetTriple.isX86()) {
6877 if (Features.lookup("avx512f"))
6878 return 512;
6879 else if (Features.lookup("avx"))
6880 return 256;
6881 return 128;
6882 }
6883 if (TargetTriple.isPPC())
6884 return 128;
6885 if (TargetTriple.isWasm())
6886 return 128;
6887 return 0;
6888}
6889
6891 MapVector<Value *, Value *> AlignedVars,
6892 Value *IfCond, OrderKind Order,
6893 ConstantInt *Simdlen, ConstantInt *Safelen) {
6894 LLVMContext &Ctx = Builder.getContext();
6895
6896 Function *F = CanonicalLoop->getFunction();
6897
6898 // Blocks must have terminators.
6899 // FIXME: Don't run analyses on incomplete/invalid IR.
6901 for (BasicBlock &BB : *F)
6902 if (!BB.getTerminator())
6903 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
6904
6905 // TODO: We should not rely on pass manager. Currently we use pass manager
6906 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6907 // object. We should have a method which returns all blocks between
6908 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6910 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6911 FAM.registerPass([]() { return LoopAnalysis(); });
6912 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6913
6914 LoopAnalysis LIA;
6915 LoopInfo &&LI = LIA.run(*F, FAM);
6916
6917 for (Instruction *I : UIs)
6918 I->eraseFromParent();
6919
6920 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6921 if (AlignedVars.size()) {
6922 InsertPointTy IP = Builder.saveIP();
6923 for (auto &AlignedItem : AlignedVars) {
6924 Value *AlignedPtr = AlignedItem.first;
6925 Value *Alignment = AlignedItem.second;
6926 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6927 Builder.SetInsertPoint(loadInst->getNextNode());
6928 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6929 Alignment);
6930 }
6931 Builder.restoreIP(IP);
6932 }
6933
6934 if (IfCond) {
6935 ValueToValueMapTy VMap;
6936 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6937 }
6938
6940
6941 // Get the basic blocks from the loop in which memref instructions
6942 // can be found.
6943 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
6944 // preferably without running any passes.
6945 for (BasicBlock *Block : L->getBlocks()) {
6946 if (Block == CanonicalLoop->getCond() ||
6947 Block == CanonicalLoop->getHeader())
6948 continue;
6949 Reachable.insert(Block);
6950 }
6951
6952 SmallVector<Metadata *> LoopMDList;
6953
6954 // In presence of finite 'safelen', it may be unsafe to mark all
6955 // the memory instructions parallel, because loop-carried
6956 // dependences of 'safelen' iterations are possible.
6957 // If clause order(concurrent) is specified then the memory instructions
6958 // are marked parallel even if 'safelen' is finite.
6959 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6960 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6961
6962 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6963 // versions so we can't add the loop attributes in that case.
6964 if (IfCond) {
6965 // we can still add llvm.loop.parallel_access
6966 addLoopMetadata(CanonicalLoop, LoopMDList);
6967 return;
6968 }
6969
6970 // Use the above access group metadata to create loop level
6971 // metadata, which should be distinct for each loop.
6972 ConstantAsMetadata *BoolConst =
6974 LoopMDList.push_back(MDNode::get(
6975 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6976
6977 if (Simdlen || Safelen) {
6978 // If both simdlen and safelen clauses are specified, the value of the
6979 // simdlen parameter must be less than or equal to the value of the safelen
6980 // parameter. Therefore, use safelen only in the absence of simdlen.
6981 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6982 LoopMDList.push_back(
6983 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6984 ConstantAsMetadata::get(VectorizeWidth)}));
6985 }
6986
6987 addLoopMetadata(CanonicalLoop, LoopMDList);
6988}
6989
6990/// Create the TargetMachine object to query the backend for optimization
6991/// preferences.
6992///
6993/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6994/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6995/// needed for the LLVM pass pipline. We use some default options to avoid
6996/// having to pass too many settings from the frontend that probably do not
6997/// matter.
6998///
6999/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7000/// method. If we are going to use TargetMachine for more purposes, especially
7001/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7002/// might become be worth requiring front-ends to pass on their TargetMachine,
7003/// or at least cache it between methods. Note that while fontends such as Clang
7004/// have just a single main TargetMachine per translation unit, "target-cpu" and
7005/// "target-features" that determine the TargetMachine are per-function and can
7006/// be overrided using __attribute__((target("OPTIONS"))).
7007static std::unique_ptr<TargetMachine>
7009 Module *M = F->getParent();
7010
7011 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7012 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7013 const llvm::Triple &Triple = M->getTargetTriple();
7014
7015 std::string Error;
7017 if (!TheTarget)
7018 return {};
7019
7021 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7022 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7023 /*CodeModel=*/std::nullopt, OptLevel));
7024}
7025
7026/// Heuristically determine the best-performant unroll factor for \p CLI. This
7027/// depends on the target processor. We are re-using the same heuristics as the
7028/// LoopUnrollPass.
7030 Function *F = CLI->getFunction();
7031
7032 // Assume the user requests the most aggressive unrolling, even if the rest of
7033 // the code is optimized using a lower setting.
7035 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7036
7037 // Blocks must have terminators.
7038 // FIXME: Don't run analyses on incomplete/invalid IR.
7040 for (BasicBlock &BB : *F)
7041 if (!BB.getTerminator())
7042 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7043
7045 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7046 FAM.registerPass([]() { return AssumptionAnalysis(); });
7047 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7048 FAM.registerPass([]() { return LoopAnalysis(); });
7049 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7050 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7051 TargetIRAnalysis TIRA;
7052 if (TM)
7053 TIRA = TargetIRAnalysis(
7054 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7055 FAM.registerPass([&]() { return TIRA; });
7056
7057 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7059 ScalarEvolution &&SE = SEA.run(*F, FAM);
7061 DominatorTree &&DT = DTA.run(*F, FAM);
7062 LoopAnalysis LIA;
7063 LoopInfo &&LI = LIA.run(*F, FAM);
7065 AssumptionCache &&AC = ACT.run(*F, FAM);
7067
7068 for (Instruction *I : UIs)
7069 I->eraseFromParent();
7070
7071 Loop *L = LI.getLoopFor(CLI->getHeader());
7072 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7073
7075 L, SE, TTI,
7076 /*BlockFrequencyInfo=*/nullptr,
7077 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7078 /*UserThreshold=*/std::nullopt,
7079 /*UserCount=*/std::nullopt,
7080 /*UserAllowPartial=*/true,
7081 /*UserAllowRuntime=*/true,
7082 /*UserUpperBound=*/std::nullopt,
7083 /*UserFullUnrollMaxCount=*/std::nullopt);
7084
7085 UP.Force = true;
7086
7087 // Account for additional optimizations taking place before the LoopUnrollPass
7088 // would unroll the loop.
7091
7092 // Use normal unroll factors even if the rest of the code is optimized for
7093 // size.
7096
7097 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7098 << " Threshold=" << UP.Threshold << "\n"
7099 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7100 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7101 << " PartialOptSizeThreshold="
7102 << UP.PartialOptSizeThreshold << "\n");
7103
7104 // Disable peeling.
7107 /*UserAllowPeeling=*/false,
7108 /*UserAllowProfileBasedPeeling=*/false,
7109 /*UnrollingSpecficValues=*/false);
7110
7112 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7113
7114 // Assume that reads and writes to stack variables can be eliminated by
7115 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7116 // size.
7117 for (BasicBlock *BB : L->blocks()) {
7118 for (Instruction &I : *BB) {
7119 Value *Ptr;
7120 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7121 Ptr = Load->getPointerOperand();
7122 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7123 Ptr = Store->getPointerOperand();
7124 } else
7125 continue;
7126
7127 Ptr = Ptr->stripPointerCasts();
7128
7129 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7130 if (Alloca->getParent() == &F->getEntryBlock())
7131 EphValues.insert(&I);
7132 }
7133 }
7134 }
7135
7136 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7137
7138 // Loop is not unrollable if the loop contains certain instructions.
7139 if (!UCE.canUnroll()) {
7140 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7141 return 1;
7142 }
7143
7144 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7145 << "\n");
7146
7147 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7148 // be able to use it.
7149 int TripCount = 0;
7150 int MaxTripCount = 0;
7151 bool MaxOrZero = false;
7152 unsigned TripMultiple = 0;
7153
7154 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7155 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7156 unsigned Factor = UP.Count;
7157 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7158
7159 // This function returns 1 to signal to not unroll a loop.
7160 if (Factor == 0)
7161 return 1;
7162 return Factor;
7163}
7164
7166 int32_t Factor,
7167 CanonicalLoopInfo **UnrolledCLI) {
7168 assert(Factor >= 0 && "Unroll factor must not be negative");
7169
7170 Function *F = Loop->getFunction();
7171 LLVMContext &Ctx = F->getContext();
7172
7173 // If the unrolled loop is not used for another loop-associated directive, it
7174 // is sufficient to add metadata for the LoopUnrollPass.
7175 if (!UnrolledCLI) {
7176 SmallVector<Metadata *, 2> LoopMetadata;
7177 LoopMetadata.push_back(
7178 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7179
7180 if (Factor >= 1) {
7182 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7183 LoopMetadata.push_back(MDNode::get(
7184 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7185 }
7186
7187 addLoopMetadata(Loop, LoopMetadata);
7188 return;
7189 }
7190
7191 // Heuristically determine the unroll factor.
7192 if (Factor == 0)
7194
7195 // No change required with unroll factor 1.
7196 if (Factor == 1) {
7197 *UnrolledCLI = Loop;
7198 return;
7199 }
7200
7201 assert(Factor >= 2 &&
7202 "unrolling only makes sense with a factor of 2 or larger");
7203
7204 Type *IndVarTy = Loop->getIndVarType();
7205
7206 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7207 // unroll the inner loop.
7208 Value *FactorVal =
7209 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7210 /*isSigned=*/false));
7211 std::vector<CanonicalLoopInfo *> LoopNest =
7212 tileLoops(DL, {Loop}, {FactorVal});
7213 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7214 *UnrolledCLI = LoopNest[0];
7215 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7216
7217 // LoopUnrollPass can only fully unroll loops with constant trip count.
7218 // Unroll by the unroll factor with a fallback epilog for the remainder
7219 // iterations if necessary.
7221 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7223 InnerLoop,
7224 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7226 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7227
7228#ifndef NDEBUG
7229 (*UnrolledCLI)->assertOK();
7230#endif
7231}
7232
7235 llvm::Value *BufSize, llvm::Value *CpyBuf,
7236 llvm::Value *CpyFn, llvm::Value *DidIt) {
7237 if (!updateToLocation(Loc))
7238 return Loc.IP;
7239
7240 uint32_t SrcLocStrSize;
7241 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7242 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7243 Value *ThreadId = getOrCreateThreadID(Ident);
7244
7245 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7246
7247 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7248
7249 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7250 createRuntimeFunctionCall(Fn, Args);
7251
7252 return Builder.saveIP();
7253}
7254
7256 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7257 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7259
7260 if (!updateToLocation(Loc))
7261 return Loc.IP;
7262
7263 // If needed allocate and initialize `DidIt` with 0.
7264 // DidIt: flag variable: 1=single thread; 0=not single thread.
7265 llvm::Value *DidIt = nullptr;
7266 if (!CPVars.empty()) {
7267 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7268 Builder.CreateStore(Builder.getInt32(0), DidIt);
7269 }
7270
7271 Directive OMPD = Directive::OMPD_single;
7272 uint32_t SrcLocStrSize;
7273 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7274 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7275 Value *ThreadId = getOrCreateThreadID(Ident);
7276 Value *Args[] = {Ident, ThreadId};
7277
7278 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7279 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7280
7281 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7282 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7283
7284 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7285 if (Error Err = FiniCB(IP))
7286 return Err;
7287
7288 // The thread that executes the single region must set `DidIt` to 1.
7289 // This is used by __kmpc_copyprivate, to know if the caller is the
7290 // single thread or not.
7291 if (DidIt)
7292 Builder.CreateStore(Builder.getInt32(1), DidIt);
7293
7294 return Error::success();
7295 };
7296
7297 // generates the following:
7298 // if (__kmpc_single()) {
7299 // .... single region ...
7300 // __kmpc_end_single
7301 // }
7302 // __kmpc_copyprivate
7303 // __kmpc_barrier
7304
7305 InsertPointOrErrorTy AfterIP =
7306 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7307 /*Conditional*/ true,
7308 /*hasFinalize*/ true);
7309 if (!AfterIP)
7310 return AfterIP.takeError();
7311
7312 if (DidIt) {
7313 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7314 // NOTE BufSize is currently unused, so just pass 0.
7316 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7317 CPFuncs[I], DidIt);
7318 // NOTE __kmpc_copyprivate already inserts a barrier
7319 } else if (!IsNowait) {
7320 InsertPointOrErrorTy AfterIP =
7322 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7323 /* CheckCancelFlag */ false);
7324 if (!AfterIP)
7325 return AfterIP.takeError();
7326 }
7327 return Builder.saveIP();
7328}
7329
7331 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7332 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7333
7334 if (!updateToLocation(Loc))
7335 return Loc.IP;
7336
7337 Directive OMPD = Directive::OMPD_critical;
7338 uint32_t SrcLocStrSize;
7339 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7340 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7341 Value *ThreadId = getOrCreateThreadID(Ident);
7342 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7343 Value *Args[] = {Ident, ThreadId, LockVar};
7344
7345 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7346 Function *RTFn = nullptr;
7347 if (HintInst) {
7348 // Add Hint to entry Args and create call
7349 EnterArgs.push_back(HintInst);
7350 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7351 } else {
7352 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7353 }
7354 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7355
7356 Function *ExitRTLFn =
7357 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7358 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7359
7360 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7361 /*Conditional*/ false, /*hasFinalize*/ true);
7362}
7363
7366 InsertPointTy AllocaIP, unsigned NumLoops,
7367 ArrayRef<llvm::Value *> StoreValues,
7368 const Twine &Name, bool IsDependSource) {
7369 assert(
7370 llvm::all_of(StoreValues,
7371 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7372 "OpenMP runtime requires depend vec with i64 type");
7373
7374 if (!updateToLocation(Loc))
7375 return Loc.IP;
7376
7377 // Allocate space for vector and generate alloc instruction.
7378 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7379 Builder.restoreIP(AllocaIP);
7380 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7381 ArgsBase->setAlignment(Align(8));
7383
7384 // Store the index value with offset in depend vector.
7385 for (unsigned I = 0; I < NumLoops; ++I) {
7386 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7387 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7388 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7389 STInst->setAlignment(Align(8));
7390 }
7391
7392 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7393 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7394
7395 uint32_t SrcLocStrSize;
7396 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7397 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7398 Value *ThreadId = getOrCreateThreadID(Ident);
7399 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7400
7401 Function *RTLFn = nullptr;
7402 if (IsDependSource)
7403 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7404 else
7405 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7406 createRuntimeFunctionCall(RTLFn, Args);
7407
7408 return Builder.saveIP();
7409}
7410
7412 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7413 FinalizeCallbackTy FiniCB, bool IsThreads) {
7414 if (!updateToLocation(Loc))
7415 return Loc.IP;
7416
7417 Directive OMPD = Directive::OMPD_ordered;
7418 Instruction *EntryCall = nullptr;
7419 Instruction *ExitCall = nullptr;
7420
7421 if (IsThreads) {
7422 uint32_t SrcLocStrSize;
7423 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7424 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7425 Value *ThreadId = getOrCreateThreadID(Ident);
7426 Value *Args[] = {Ident, ThreadId};
7427
7428 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7429 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7430
7431 Function *ExitRTLFn =
7432 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7433 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7434 }
7435
7436 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7437 /*Conditional*/ false, /*hasFinalize*/ true);
7438}
7439
7440OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7441 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7442 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7443 bool HasFinalize, bool IsCancellable) {
7444
7445 if (HasFinalize)
7446 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7447
7448 // Create inlined region's entry and body blocks, in preparation
7449 // for conditional creation
7450 BasicBlock *EntryBB = Builder.GetInsertBlock();
7451 Instruction *SplitPos = EntryBB->getTerminator();
7453 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7454 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7455 BasicBlock *FiniBB =
7456 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7457
7458 Builder.SetInsertPoint(EntryBB->getTerminator());
7459 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7460
7461 // generate body
7462 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7463 /* CodeGenIP */ Builder.saveIP()))
7464 return Err;
7465
7466 // emit exit call and do any needed finalization.
7467 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7468 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7469 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7470 "Unexpected control flow graph state!!");
7471 InsertPointOrErrorTy AfterIP =
7472 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7473 if (!AfterIP)
7474 return AfterIP.takeError();
7475
7476 // If we are skipping the region of a non conditional, remove the exit
7477 // block, and clear the builder's insertion point.
7478 assert(SplitPos->getParent() == ExitBB &&
7479 "Unexpected Insertion point location!");
7480 auto merged = MergeBlockIntoPredecessor(ExitBB);
7481 BasicBlock *ExitPredBB = SplitPos->getParent();
7482 auto InsertBB = merged ? ExitPredBB : ExitBB;
7484 SplitPos->eraseFromParent();
7485 Builder.SetInsertPoint(InsertBB);
7486
7487 return Builder.saveIP();
7488}
7489
// Emit the conditional entry of an inlined region: when \p Conditional is set,
// the body only executes if \p EntryCall returned a non-zero value. Rewrites
// the current block's terminator into a conditional branch to a fresh
// "omp_region.body" block (taken) or \p ExitBB (not taken).
//
// On return the builder is positioned inside the body block; the returned
// insertion point references \p ExitBB.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // if nothing to do, Return current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  // Non-zero runtime result means "execute the region body".
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  // Placeholder terminator so ThenBB is well-formed until the moved branch
  // replaces it below.
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit thenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move Entry branch to end of ThenBB, and replace with conditional
  // branch (If-stmt)
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  // Re-home the old terminator: detach it, insert it where the placeholder
  // sits, then drop the placeholder.
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}
7519
// Emit the exit side of an inlined region: pop and run the finalization
// registered for directive \p OMPD (when \p HasFinalize) and, if provided,
// splice \p ExitCall in as the last instruction before the finalization
// block's terminator.
//
// Returns the insertion point at the spliced exit call, or the current
// builder position when there is no exit call.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    // The innermost finalization entry must belong to this directive
    // (checked by the assert below).
    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
      return std::move(Err);

    // Exit condition: insertion point is before the terminator of the new Fini
    // block
    Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
  }

  if (!ExitCall)
    return Builder.saveIP();

  // place the Exitcall as last instruction before Finalization block terminator
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}
7552
7554 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7555 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7556 if (!IP.isSet())
7557 return IP;
7558
7560
7561 // creates the following CFG structure
7562 // OMP_Entry : (MasterAddr != PrivateAddr)?
7563 // F T
7564 // | \
7565 // | copin.not.master
7566 // | /
7567 // v /
7568 // copyin.not.master.end
7569 // |
7570 // v
7571 // OMP.Entry.Next
7572
7573 BasicBlock *OMP_Entry = IP.getBlock();
7574 Function *CurFn = OMP_Entry->getParent();
7575 BasicBlock *CopyBegin =
7576 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7577 BasicBlock *CopyEnd = nullptr;
7578
7579 // If entry block is terminated, split to preserve the branch to following
7580 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7581 if (isa_and_nonnull<CondBrInst>(OMP_Entry->getTerminator())) {
7582 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7583 "copyin.not.master.end");
7584 OMP_Entry->getTerminator()->eraseFromParent();
7585 } else {
7586 CopyEnd =
7587 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7588 }
7589
7590 Builder.SetInsertPoint(OMP_Entry);
7591 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7592 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7593 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7594 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7595
7596 Builder.SetInsertPoint(CopyBegin);
7597 if (BranchtoEnd)
7598 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7599
7600 return Builder.saveIP();
7601}
7602
7604 Value *Size, Value *Allocator,
7605 std::string Name) {
7608
7609 uint32_t SrcLocStrSize;
7610 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7611 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7612 Value *ThreadId = getOrCreateThreadID(Ident);
7613 Value *Args[] = {ThreadId, Size, Allocator};
7614
7615 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7616
7617 return createRuntimeFunctionCall(Fn, Args, Name);
7618}
7619
7621 Value *Addr, Value *Allocator,
7622 std::string Name) {
7625
7626 uint32_t SrcLocStrSize;
7627 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7628 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7629 Value *ThreadId = getOrCreateThreadID(Ident);
7630 Value *Args[] = {ThreadId, Addr, Allocator};
7631 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7632 return createRuntimeFunctionCall(Fn, Args, Name);
7633}
7634
7636 const LocationDescription &Loc, Value *InteropVar,
7637 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7638 Value *DependenceAddress, bool HaveNowaitClause) {
7641
7642 uint32_t SrcLocStrSize;
7643 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7644 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7645 Value *ThreadId = getOrCreateThreadID(Ident);
7646 if (Device == nullptr)
7647 Device = Constant::getAllOnesValue(Int32);
7648 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7649 if (NumDependences == nullptr) {
7650 NumDependences = ConstantInt::get(Int32, 0);
7651 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7652 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7653 }
7654 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7655 Value *Args[] = {
7656 Ident, ThreadId, InteropVar, InteropTypeVal,
7657 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7658
7659 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7660
7661 return createRuntimeFunctionCall(Fn, Args);
7662}
7663
7665 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7666 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7669
7670 uint32_t SrcLocStrSize;
7671 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7672 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7673 Value *ThreadId = getOrCreateThreadID(Ident);
7674 if (Device == nullptr)
7675 Device = Constant::getAllOnesValue(Int32);
7676 if (NumDependences == nullptr) {
7677 NumDependences = ConstantInt::get(Int32, 0);
7678 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7679 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7680 }
7681 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7682 Value *Args[] = {
7683 Ident, ThreadId, InteropVar, Device,
7684 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7685
7686 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7687
7688 return createRuntimeFunctionCall(Fn, Args);
7689}
7690
7692 Value *InteropVar, Value *Device,
7693 Value *NumDependences,
7694 Value *DependenceAddress,
7695 bool HaveNowaitClause) {
7698 uint32_t SrcLocStrSize;
7699 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7700 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7701 Value *ThreadId = getOrCreateThreadID(Ident);
7702 if (Device == nullptr)
7703 Device = Constant::getAllOnesValue(Int32);
7704 if (NumDependences == nullptr) {
7705 NumDependences = ConstantInt::get(Int32, 0);
7706 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7707 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7708 }
7709 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7710 Value *Args[] = {
7711 Ident, ThreadId, InteropVar, Device,
7712 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7713
7714 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7715
7716 return createRuntimeFunctionCall(Fn, Args);
7717}
7718
7721 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7724
7725 uint32_t SrcLocStrSize;
7726 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7727 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7728 Value *ThreadId = getOrCreateThreadID(Ident);
7729 Constant *ThreadPrivateCache =
7730 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7731 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7732
7733 Function *Fn =
7734 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7735
7736 return createRuntimeFunctionCall(Fn, Args);
7737}
7738
7740 const LocationDescription &Loc,
7742 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7743 "expected num_threads and num_teams to be specified");
7744
7745 if (!updateToLocation(Loc))
7746 return Loc.IP;
7747
7748 uint32_t SrcLocStrSize;
7749 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7750 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7751 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7752 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7753 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7754 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7755 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7756
7757 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7758 Function *Kernel = DebugKernelWrapper;
7759
7760 // We need to strip the debug prefix to get the correct kernel name.
7761 StringRef KernelName = Kernel->getName();
7762 const std::string DebugPrefix = "_debug__";
7763 if (KernelName.ends_with(DebugPrefix)) {
7764 KernelName = KernelName.drop_back(DebugPrefix.length());
7765 Kernel = M.getFunction(KernelName);
7766 assert(Kernel && "Expected the real kernel to exist");
7767 }
7768
7769 // Manifest the launch configuration in the metadata matching the kernel
7770 // environment.
7771 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7772 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7773
7774 // If MaxThreads not set, select the maximum between the default workgroup
7775 // size and the MinThreads value.
7776 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7777 if (MaxThreadsVal < 0)
7778 MaxThreadsVal = std::max(
7779 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7780
7781 if (MaxThreadsVal > 0)
7782 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7783
7784 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7785 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7786 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7787 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7788 Constant *ReductionDataSize =
7789 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7790 Constant *ReductionBufferLength =
7791 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7792
7794 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7795 const DataLayout &DL = Fn->getDataLayout();
7796
7797 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7798 Constant *DynamicEnvironmentInitializer =
7799 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7800 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7801 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7802 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7803 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7804 DL.getDefaultGlobalsAddressSpace());
7805 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7806
7807 Constant *DynamicEnvironment =
7808 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7809 ? DynamicEnvironmentGV
7810 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7811 DynamicEnvironmentPtr);
7812
7813 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7814 ConfigurationEnvironment, {
7815 UseGenericStateMachineVal,
7816 MayUseNestedParallelismVal,
7817 IsSPMDVal,
7818 MinThreads,
7819 MaxThreads,
7820 MinTeams,
7821 MaxTeams,
7822 ReductionDataSize,
7823 ReductionBufferLength,
7824 });
7825 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7826 KernelEnvironment, {
7827 ConfigurationEnvironmentInitializer,
7828 Ident,
7829 DynamicEnvironment,
7830 });
7831 std::string KernelEnvironmentName =
7832 (KernelName + "_kernel_environment").str();
7833 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7834 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7835 KernelEnvironmentInitializer, KernelEnvironmentName,
7836 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7837 DL.getDefaultGlobalsAddressSpace());
7838 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7839
7840 Constant *KernelEnvironment =
7841 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7842 ? KernelEnvironmentGV
7843 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7844 KernelEnvironmentPtr);
7845 Value *KernelLaunchEnvironment =
7846 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
7847 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7848 KernelLaunchEnvironment =
7849 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7850 ? KernelLaunchEnvironment
7851 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7852 KernelLaunchEnvParamTy);
7853 CallInst *ThreadKind = createRuntimeFunctionCall(
7854 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7855
7856 Value *ExecUserCode = Builder.CreateICmpEQ(
7857 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7858 "exec_user_code");
7859
7860 // ThreadKind = __kmpc_target_init(...)
7861 // if (ThreadKind == -1)
7862 // user_code
7863 // else
7864 // return;
7865
7866 auto *UI = Builder.CreateUnreachable();
7867 BasicBlock *CheckBB = UI->getParent();
7868 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7869
7870 BasicBlock *WorkerExitBB = BasicBlock::Create(
7871 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7872 Builder.SetInsertPoint(WorkerExitBB);
7873 Builder.CreateRetVoid();
7874
7875 auto *CheckBBTI = CheckBB->getTerminator();
7876 Builder.SetInsertPoint(CheckBBTI);
7877 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7878
7879 CheckBBTI->eraseFromParent();
7880 UI->eraseFromParent();
7881
7882 // Continue in the "user_code" block, see diagram above and in
7883 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7884 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7885}
7886
7888 int32_t TeamsReductionDataSize,
7889 int32_t TeamsReductionBufferLength) {
7890 if (!updateToLocation(Loc))
7891 return;
7892
7894 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7895
7897
7898 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7899 return;
7900
7901 Function *Kernel = Builder.GetInsertBlock()->getParent();
7902 // We need to strip the debug prefix to get the correct kernel name.
7903 StringRef KernelName = Kernel->getName();
7904 const std::string DebugPrefix = "_debug__";
7905 if (KernelName.ends_with(DebugPrefix))
7906 KernelName = KernelName.drop_back(DebugPrefix.length());
7907 auto *KernelEnvironmentGV =
7908 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7909 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7910 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7911 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7912 KernelEnvironmentInitializer,
7913 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7914 NewInitializer = ConstantFoldInsertValueInstruction(
7915 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7916 {0, 8});
7917 KernelEnvironmentGV->setInitializer(NewInitializer);
7918}
7919
7920static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7921 bool Min) {
7922 if (Kernel.hasFnAttribute(Name)) {
7923 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7924 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7925 }
7926 Kernel.addFnAttr(Name, llvm::utostr(Value));
7927}
7928
7929std::pair<int32_t, int32_t>
7931 int32_t ThreadLimit =
7932 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7933
7934 if (T.isAMDGPU()) {
7935 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7936 if (!Attr.isValid() || !Attr.isStringAttribute())
7937 return {0, ThreadLimit};
7938 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7939 int32_t LB, UB;
7940 if (!llvm::to_integer(UBStr, UB, 10))
7941 return {0, ThreadLimit};
7942 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7943 if (!llvm::to_integer(LBStr, LB, 10))
7944 return {0, UB};
7945 return {LB, UB};
7946 }
7947
7948 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7949 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7950 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7951 }
7952 return {0, ThreadLimit};
7953}
7954
7956 Function &Kernel, int32_t LB,
7957 int32_t UB) {
7958 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7959
7960 if (T.isAMDGPU()) {
7961 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7962 llvm::utostr(LB) + "," + llvm::utostr(UB));
7963 return;
7964 }
7965
7966 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7967}
7968
7969std::pair<int32_t, int32_t>
7971 // TODO: Read from backend annotations if available.
7972 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7973}
7974
7976 int32_t LB, int32_t UB) {
7977 if (T.isNVPTX())
7978 if (UB > 0)
7979 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7980 if (T.isAMDGPU())
7981 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7982
7983 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7984}
7985
7986void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7987 Function *OutlinedFn) {
7988 if (Config.isTargetDevice()) {
7990 // TODO: Determine if DSO local can be set to true.
7991 OutlinedFn->setDSOLocal(false);
7993 if (T.isAMDGCN())
7995 else if (T.isNVPTX())
7997 else if (T.isSPIRV())
7999 }
8000}
8001
8002Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8003 StringRef EntryFnIDName) {
8004 if (Config.isTargetDevice()) {
8005 assert(OutlinedFn && "The outlined function must exist if embedded");
8006 return OutlinedFn;
8007 }
8008
8009 return new GlobalVariable(
8010 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8011 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8012}
8013
8014Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8015 StringRef EntryFnName) {
8016 if (OutlinedFn)
8017 return OutlinedFn;
8018
8019 assert(!M.getGlobalVariable(EntryFnName, true) &&
8020 "Named kernel already exists?");
8021 return new GlobalVariable(
8022 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8023 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8024}
8025
8027 TargetRegionEntryInfo &EntryInfo,
8028 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8029 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8030
8031 SmallString<64> EntryFnName;
8032 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8033
8034 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8035 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8036 if (!CBResult)
8037 return CBResult.takeError();
8038 OutlinedFn = *CBResult;
8039 } else {
8040 OutlinedFn = nullptr;
8041 }
8042
8043 // If this target outline function is not an offload entry, we don't need to
8044 // register it. This may be in the case of a false if clause, or if there are
8045 // no OpenMP targets.
8046 if (!IsOffloadEntry)
8047 return Error::success();
8048
8049 std::string EntryFnIDName =
8050 Config.isTargetDevice()
8051 ? std::string(EntryFnName)
8052 : createPlatformSpecificName({EntryFnName, "region_id"});
8053
8054 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8055 EntryFnName, EntryFnIDName);
8056 return Error::success();
8057}
8058
8060 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8061 StringRef EntryFnName, StringRef EntryFnIDName) {
8062 if (OutlinedFn)
8063 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8064 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8065 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8066 OffloadInfoManager.registerTargetRegionEntryInfo(
8067 EntryInfo, EntryAddr, OutlinedFnID,
8069 return OutlinedFnID;
8070}
8071
8073 const LocationDescription &Loc, InsertPointTy AllocaIP,
8074 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8075 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8076 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
8078 BodyGenTy BodyGenType)>
8079 BodyGenCB,
8080 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8081 if (!updateToLocation(Loc))
8082 return InsertPointTy();
8083
8084 Builder.restoreIP(CodeGenIP);
8085
8086 bool IsStandAlone = !BodyGenCB;
8087 MapInfosTy *MapInfo;
8088 // Generate the code for the opening of the data environment. Capture all the
8089 // arguments of the runtime call by reference because they are used in the
8090 // closing of the region.
8091 auto BeginThenGen = [&](InsertPointTy AllocaIP,
8092 InsertPointTy CodeGenIP) -> Error {
8093 MapInfo = &GenMapInfoCB(Builder.saveIP());
8094 if (Error Err = emitOffloadingArrays(
8095 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8096 /*IsNonContiguous=*/true, DeviceAddrCB))
8097 return Err;
8098
8099 TargetDataRTArgs RTArgs;
8101
8102 // Emit the number of elements in the offloading arrays.
8103 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8104
8105 // Source location for the ident struct
8106 if (!SrcLocInfo) {
8107 uint32_t SrcLocStrSize;
8108 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8109 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8110 }
8111
8112 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8113 SrcLocInfo, DeviceID,
8114 PointerNum, RTArgs.BasePointersArray,
8115 RTArgs.PointersArray, RTArgs.SizesArray,
8116 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8117 RTArgs.MappersArray};
8118
8119 if (IsStandAlone) {
8120 assert(MapperFunc && "MapperFunc missing for standalone target data");
8121
8122 auto TaskBodyCB = [&](Value *, Value *,
8124 if (Info.HasNoWait) {
8125 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8129 }
8130
8132 OffloadingArgs);
8133
8134 if (Info.HasNoWait) {
8135 BasicBlock *OffloadContBlock =
8136 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8137 Function *CurFn = Builder.GetInsertBlock()->getParent();
8138 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8139 Builder.restoreIP(Builder.saveIP());
8140 }
8141 return Error::success();
8142 };
8143
8144 bool RequiresOuterTargetTask = Info.HasNoWait;
8145 if (!RequiresOuterTargetTask)
8146 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8147 /*TargetTaskAllocaIP=*/{}));
8148 else
8149 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8150 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8151 } else {
8152 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8153 omp::OMPRTL___tgt_target_data_begin_mapper);
8154
8155 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8156
8157 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8158 if (isa<AllocaInst>(DeviceMap.second.second)) {
8159 auto *LI =
8160 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8161 Builder.CreateStore(LI, DeviceMap.second.second);
8162 }
8163 }
8164
8165 // If device pointer privatization is required, emit the body of the
8166 // region here. It will have to be duplicated: with and without
8167 // privatization.
8168 InsertPointOrErrorTy AfterIP =
8169 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8170 if (!AfterIP)
8171 return AfterIP.takeError();
8172 Builder.restoreIP(*AfterIP);
8173 }
8174 return Error::success();
8175 };
8176
8177 // If we need device pointer privatization, we need to emit the body of the
8178 // region with no privatization in the 'else' branch of the conditional.
8179 // Otherwise, we don't have to do anything.
8180 auto BeginElseGen = [&](InsertPointTy AllocaIP,
8181 InsertPointTy CodeGenIP) -> Error {
8182 InsertPointOrErrorTy AfterIP =
8183 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8184 if (!AfterIP)
8185 return AfterIP.takeError();
8186 Builder.restoreIP(*AfterIP);
8187 return Error::success();
8188 };
8189
8190 // Generate code for the closing of the data region.
8191 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8192 TargetDataRTArgs RTArgs;
8193 Info.EmitDebug = !MapInfo->Names.empty();
8194 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8195
8196 // Emit the number of elements in the offloading arrays.
8197 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8198
8199 // Source location for the ident struct
8200 if (!SrcLocInfo) {
8201 uint32_t SrcLocStrSize;
8202 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8203 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8204 }
8205
8206 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8207 PointerNum, RTArgs.BasePointersArray,
8208 RTArgs.PointersArray, RTArgs.SizesArray,
8209 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8210 RTArgs.MappersArray};
8211 Function *EndMapperFunc =
8212 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8213
8214 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8215 return Error::success();
8216 };
8217
8218 // We don't have to do anything to close the region if the if clause evaluates
8219 // to false.
8220 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8221 return Error::success();
8222 };
8223
8224 Error Err = [&]() -> Error {
8225 if (BodyGenCB) {
8226 Error Err = [&]() {
8227 if (IfCond)
8228 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8229 return BeginThenGen(AllocaIP, Builder.saveIP());
8230 }();
8231
8232 if (Err)
8233 return Err;
8234
8235 // If we don't require privatization of device pointers, we emit the body
8236 // in between the runtime calls. This avoids duplicating the body code.
8237 InsertPointOrErrorTy AfterIP =
8238 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8239 if (!AfterIP)
8240 return AfterIP.takeError();
8241 restoreIPandDebugLoc(Builder, *AfterIP);
8242
8243 if (IfCond)
8244 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8245 return EndThenGen(AllocaIP, Builder.saveIP());
8246 }
8247 if (IfCond)
8248 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8249 return BeginThenGen(AllocaIP, Builder.saveIP());
8250 }();
8251
8252 if (Err)
8253 return Err;
8254
8255 return Builder.saveIP();
8256}
8257
8260 bool IsGPUDistribute) {
8261 assert((IVSize == 32 || IVSize == 64) &&
8262 "IV size is not compatible with the omp runtime");
8263 RuntimeFunction Name;
8264 if (IsGPUDistribute)
8265 Name = IVSize == 32
8266 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8267 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8268 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8269 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8270 else
8271 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8272 : omp::OMPRTL___kmpc_for_static_init_4u)
8273 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8274 : omp::OMPRTL___kmpc_for_static_init_8u);
8275
8276 return getOrCreateRuntimeFunction(M, Name);
8277}
8278
8280 bool IVSigned) {
8281 assert((IVSize == 32 || IVSize == 64) &&
8282 "IV size is not compatible with the omp runtime");
8283 RuntimeFunction Name = IVSize == 32
8284 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8285 : omp::OMPRTL___kmpc_dispatch_init_4u)
8286 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8287 : omp::OMPRTL___kmpc_dispatch_init_8u);
8288
8289 return getOrCreateRuntimeFunction(M, Name);
8290}
8291
8293 bool IVSigned) {
8294 assert((IVSize == 32 || IVSize == 64) &&
8295 "IV size is not compatible with the omp runtime");
8296 RuntimeFunction Name = IVSize == 32
8297 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8298 : omp::OMPRTL___kmpc_dispatch_next_4u)
8299 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8300 : omp::OMPRTL___kmpc_dispatch_next_8u);
8301
8302 return getOrCreateRuntimeFunction(M, Name);
8303}
8304
8306 bool IVSigned) {
8307 assert((IVSize == 32 || IVSize == 64) &&
8308 "IV size is not compatible with the omp runtime");
8309 RuntimeFunction Name = IVSize == 32
8310 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8311 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8312 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8313 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8314
8315 return getOrCreateRuntimeFunction(M, Name);
8316}
8317
8319 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8320}
8321
8323 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8324 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8325
8326 DISubprogram *NewSP = Func->getSubprogram();
8327 if (!NewSP)
8328 return;
8329
8331
8332 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8333 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8334 // Only use cached variable if the arg number matches. This is important
8335 // so that DIVariable created for privatized variables are not discarded.
8336 if (NewVar && (arg == NewVar->getArg()))
8337 return NewVar;
8338
8340 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8341 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8342 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8343 return NewVar;
8344 };
8345
8346 auto UpdateDebugRecord = [&](auto *DR) {
8347 DILocalVariable *OldVar = DR->getVariable();
8348 unsigned ArgNo = 0;
8349 for (auto Loc : DR->location_ops()) {
8350 auto Iter = ValueReplacementMap.find(Loc);
8351 if (Iter != ValueReplacementMap.end()) {
8352 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8353 ArgNo = std::get<1>(Iter->second) + 1;
8354 }
8355 }
8356 if (ArgNo != 0)
8357 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8358 };
8359
8360 // The location and scope of variable intrinsics and records still point to
8361 // the parent function of the target region. Update them.
8362 for (Instruction &I : instructions(Func)) {
8364 "Unexpected debug intrinsic");
8365 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
8366 UpdateDebugRecord(&DVR);
8367 }
8368 // An extra argument is passed to the device. Create the debug data for it.
8369 if (OMPBuilder.Config.isTargetDevice()) {
8370 DICompileUnit *CU = NewSP->getUnit();
8371 Module *M = Func->getParent();
8372 DIBuilder DB(*M, true, CU);
8373 DIType *VoidPtrTy =
8374 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8375 unsigned ArgNo = Func->arg_size();
8376 DILocalVariable *Var = DB.createParameterVariable(
8377 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8378 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8379 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8380 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8381 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8382 &(*Func->begin()));
8383 }
8384}
8385
8387 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8388 return cast<Operator>(V)->getOperand(0);
8389 return V;
8390}
8391
8393 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8395 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8398 SmallVector<Type *> ParameterTypes;
8399 if (OMPBuilder.Config.isTargetDevice()) {
8400 // All parameters to target devices are passed as pointers
8401 // or i64. This assumes 64-bit address spaces/pointers.
8402 for (auto &Arg : Inputs)
8403 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8404 ? Arg->getType()
8405 : Type::getInt64Ty(Builder.getContext()));
8406 } else {
8407 for (auto &Arg : Inputs)
8408 ParameterTypes.push_back(Arg->getType());
8409 }
8410
8411 // The implicit dyn_ptr argument is always the last parameter on both host
8412 // and device so the argument counts match without runtime manipulation.
8413 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8414 ParameterTypes.push_back(PtrTy);
8415
8416 auto BB = Builder.GetInsertBlock();
8417 auto M = BB->getModule();
8418 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8419 /*isVarArg*/ false);
8420 auto Func =
8421 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8422
8423 // Forward target-cpu and target-features function attributes from the
8424 // original function to the new outlined function.
8425 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8426
8427 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8428 if (TargetCpuAttr.isStringAttribute())
8429 Func->addFnAttr(TargetCpuAttr);
8430
8431 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8432 if (TargetFeaturesAttr.isStringAttribute())
8433 Func->addFnAttr(TargetFeaturesAttr);
8434
8435 if (OMPBuilder.Config.isTargetDevice()) {
8436 Value *ExecMode =
8437 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8438 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8439 }
8440
8441 // Save insert point.
8442 IRBuilder<>::InsertPointGuard IPG(Builder);
8443 // We will generate the entries in the outlined function but the debug
8444 // location may still be pointing to the parent function. Reset it now.
8445 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8446
8447 // Generate the region into the function.
8448 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8449 Builder.SetInsertPoint(EntryBB);
8450
8451 // Insert target init call in the device compilation pass.
8452 if (OMPBuilder.Config.isTargetDevice())
8453 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8454
8455 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8456
8457 // As we embed the user code in the middle of our target region after we
8458 // generate entry code, we must move what allocas we can into the entry
8459 // block to avoid possible breaking optimisations for device
8460 if (OMPBuilder.Config.isTargetDevice())
8462
8463 // Insert target deinit call in the device compilation pass.
8464 BasicBlock *OutlinedBodyBB =
8465 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8467 Builder.saveIP(),
8468 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8469 if (!AfterIP)
8470 return AfterIP.takeError();
8471 Builder.restoreIP(*AfterIP);
8472 if (OMPBuilder.Config.isTargetDevice())
8473 OMPBuilder.createTargetDeinit(Builder);
8474
8475 // Insert return instruction.
8476 Builder.CreateRetVoid();
8477
8478 // New Alloca IP at entry point of created device function.
8479 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8480 auto AllocaIP = Builder.saveIP();
8481
8482 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8483
8484 // Do not include the artificial dyn_ptr argument.
8485 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8486
8488
8489 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8490 // Things like GEP's can come in the form of Constants. Constants and
8491 // ConstantExpr's do not have access to the knowledge of what they're
8492 // contained in, so we must dig a little to find an instruction so we
8493 // can tell if they're used inside of the function we're outlining. We
8494 // also replace the original constant expression with a new instruction
8495 // equivalent; an instruction as it allows easy modification in the
8496 // following loop, as we can now know the constant (instruction) is
8497 // owned by our target function and replaceUsesOfWith can now be invoked
8498 // on it (cannot do this with constants it seems). A brand new one also
8499 // allows us to be cautious as it is perhaps possible the old expression
8500 // was used inside of the function but exists and is used externally
8501 // (unlikely by the nature of a Constant, but still).
8502 // NOTE: We cannot remove dead constants that have been rewritten to
8503 // instructions at this stage, we run the risk of breaking later lowering
8504 // by doing so as we could still be in the process of lowering the module
8505 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8506 // constants we have created rewritten versions of.
8507 if (auto *Const = dyn_cast<Constant>(Input))
8508 convertUsersOfConstantsToInstructions(Const, Func, false);
8509
8510 // Collect users before iterating over them to avoid invalidating the
8511 // iteration in case a user uses Input more than once (e.g. a call
8512 // instruction).
8513 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8514 // Collect all the instructions
8516 if (auto *Instr = dyn_cast<Instruction>(User))
8517 if (Instr->getFunction() == Func)
8518 Instr->replaceUsesOfWith(Input, InputCopy);
8519 };
8520
8521 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8522
8523 // Rewrite uses of input valus to parameters.
8524 for (auto InArg : zip(Inputs, ArgRange)) {
8525 Value *Input = std::get<0>(InArg);
8526 Argument &Arg = std::get<1>(InArg);
8527 Value *InputCopy = nullptr;
8528
8530 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8531 if (!AfterIP)
8532 return AfterIP.takeError();
8533 Builder.restoreIP(*AfterIP);
8534 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8535
8536 // In certain cases a Global may be set up for replacement, however, this
8537 // Global may be used in multiple arguments to the kernel, just segmented
8538 // apart, for example, if we have a global array, that is sectioned into
8539 // multiple mappings (technically not legal in OpenMP, but there is a case
8540 // in Fortran for Common Blocks where this is neccesary), we will end up
8541 // with GEP's into this array inside the kernel, that refer to the Global
8542 // but are technically separate arguments to the kernel for all intents and
8543 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8544 // index, it will fold into an referal to the Global, if we then encounter
8545 // this folded GEP during replacement all of the references to the
8546 // Global in the kernel will be replaced with the argument we have generated
8547 // that corresponds to it, including any other GEP's that refer to the
8548 // Global that may be other arguments. This will invalidate all of the other
8549 // preceding mapped arguments that refer to the same global that may be
8550 // separate segments. To prevent this, we defer global processing until all
8551 // other processing has been performed.
8554 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8555 continue;
8556 }
8557
8559 continue;
8560
8561 ReplaceValue(Input, InputCopy, Func);
8562 }
8563
8564 // Replace all of our deferred Input values, currently just Globals.
8565 for (auto Deferred : DeferredReplacement)
8566 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8567
8568 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8569 ValueReplacementMap);
8570 return Func;
8571}
8572/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8573/// of pointers containing shared data between the parent task and the created
8574/// task.
8576 IRBuilderBase &Builder,
8577 Value *TaskWithPrivates,
8578 Type *TaskWithPrivatesTy) {
8579
8580 Type *TaskTy = OMPIRBuilder.Task;
8581 LLVMContext &Ctx = Builder.getContext();
8582 Value *TaskT =
8583 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8584 Value *Shareds = TaskT;
8585 // TaskWithPrivatesTy can be one of the following
8586 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8587 // %struct.privates }
8588 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8589 //
8590 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8591 // its first member has to be the task descriptor. TaskTy is the type of the
8592 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8593 // first member of TaskT, gives us the pointer to shared data.
8594 if (TaskWithPrivatesTy != TaskTy)
8595 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8596 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8597}
8598/// Create an entry point for a target task with the following.
8599/// It'll have the following signature
8600/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8601/// This function is called from emitTargetTask once the
8602/// code to launch the target kernel has been outlined already.
8603/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8604/// into the task structure so that the deferred target task can access this
8605/// data even after the stack frame of the generating task has been rolled
8606/// back. Offloading arrays contain base pointers, pointers, sizes etc
8607/// of the data that the target kernel will access. These in effect are the
8608/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
8610 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8611 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8612 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8613
8614 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8615 // This is because PrivatesTy is the type of the structure in which
8616 // we pass the offloading arrays to the deferred target task.
8617 assert((!NumOffloadingArrays || PrivatesTy) &&
8618 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8619 "to privatize");
8620
8621 Module &M = OMPBuilder.M;
8622 // KernelLaunchFunction is the target launch function, i.e.
8623 // the function that sets up kernel arguments and calls
8624 // __tgt_target_kernel to launch the kernel on the device.
8625 //
8626 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8627
8628 // StaleCI is the CallInst which is the call to the outlined
8629 // target kernel launch function. If there are local live-in values
8630 // that the outlined function uses then these are aggregated into a structure
8631 // which is passed as the second argument. If there are no local live-in
8632 // values or if all values used by the outlined kernel are global variables,
8633 // then there's only one argument, the threadID. So, StaleCI can be
8634 //
8635 // %structArg = alloca { ptr, ptr }, align 8
8636 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8637 // store ptr %20, ptr %gep_, align 8
8638 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8639 // store ptr %21, ptr %gep_8, align 8
8640 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8641 //
8642 // OR
8643 //
8644 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8646 StaleCI->getIterator());
8647
8648 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8649
8650 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8651 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8652 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8653
8654 auto ProxyFnTy =
8655 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8656 /* isVarArg */ false);
8657 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8658 ".omp_target_task_proxy_func",
8659 Builder.GetInsertBlock()->getModule());
8660 Value *ThreadId = ProxyFn->getArg(0);
8661 Value *TaskWithPrivates = ProxyFn->getArg(1);
8662 ThreadId->setName("thread.id");
8663 TaskWithPrivates->setName("task");
8664
8665 bool HasShareds = SharedArgsOperandNo > 0;
8666 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8667 BasicBlock *EntryBB =
8668 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8669 Builder.SetInsertPoint(EntryBB);
8670
8671 SmallVector<Value *> KernelLaunchArgs;
8672 KernelLaunchArgs.reserve(StaleCI->arg_size());
8673 KernelLaunchArgs.push_back(ThreadId);
8674
8675 if (HasOffloadingArrays) {
8676 assert(TaskTy != TaskWithPrivatesTy &&
8677 "If there are offloading arrays to pass to the target"
8678 "TaskTy cannot be the same as TaskWithPrivatesTy");
8679 (void)TaskTy;
8680 Value *Privates =
8681 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8682 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8683 KernelLaunchArgs.push_back(
8684 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8685 }
8686
8687 if (HasShareds) {
8688 auto *ArgStructAlloca =
8689 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8690 assert(ArgStructAlloca &&
8691 "Unable to find the alloca instruction corresponding to arguments "
8692 "for extracted function");
8693 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8694 std::optional<TypeSize> ArgAllocSize =
8695 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8696 assert(ArgStructType && ArgAllocSize &&
8697 "Unable to determine size of arguments for extracted function");
8698 uint64_t StructSize = ArgAllocSize->getFixedValue();
8699
8700 AllocaInst *NewArgStructAlloca =
8701 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8702
8703 Value *SharedsSize = Builder.getInt64(StructSize);
8704
8706 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8707
8708 Builder.CreateMemCpy(
8709 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8710 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8711 KernelLaunchArgs.push_back(NewArgStructAlloca);
8712 }
8713 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8714 Builder.CreateRetVoid();
8715 return ProxyFn;
8716}
8718
8719 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8720 return GEP->getSourceElementType();
8721 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8722 return Alloca->getAllocatedType();
8723
8724 llvm_unreachable("Unhandled Instruction type");
8725 return nullptr;
8726}
8727// This function returns a struct that has at most two members.
8728// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8729// descriptor. The second member, if needed, is a struct containing arrays
8730// that need to be passed to the offloaded target kernel. For example,
8731// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8732// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8733// respectively, then the types created by this function are
8734//
8735// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8736// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8737// %struct.privates }
8738// %struct.task_with_privates is returned by this function.
8739// If there aren't any offloading arrays to pass to the target kernel,
8740// %struct.kmp_task_ompbuilder_t is returned.
8741static StructType *
8743 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8744
8745 if (OffloadingArraysToPrivatize.empty())
8746 return OMPIRBuilder.Task;
8747
8748 SmallVector<Type *, 4> StructFieldTypes;
8749 for (Value *V : OffloadingArraysToPrivatize) {
8750 assert(V->getType()->isPointerTy() &&
8751 "Expected pointer to array to privatize. Got a non-pointer value "
8752 "instead");
8753 Type *ArrayTy = getOffloadingArrayType(V);
8754 assert(ArrayTy && "ArrayType cannot be nullptr");
8755 StructFieldTypes.push_back(ArrayTy);
8756 }
8757 StructType *PrivatesStructTy =
8758 StructType::create(StructFieldTypes, "struct.privates");
8759 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8760 "struct.task_with_privates");
8761}
8763 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8764 TargetRegionEntryInfo &EntryInfo,
8766 Function *&OutlinedFn, Constant *&OutlinedFnID,
8770
8771 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8772 [&](StringRef EntryFnName) {
8773 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8774 EntryFnName, Inputs, CBFunc,
8775 ArgAccessorFuncCB);
8776 };
8777
8778 return OMPBuilder.emitTargetRegionFunction(
8779 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8780 OutlinedFnID);
8781}
8782
8784 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8787 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8788
8789 // The following explains the code-gen scenario for the `target` directive. A
8790 // similar scneario is followed for other device-related directives (e.g.
8791 // `target enter data`) but in similar fashion since we only need to emit task
8792 // that encapsulates the proper runtime call.
8793 //
8794 // When we arrive at this function, the target region itself has been
8795 // outlined into the function OutlinedFn.
8796 // So at ths point, for
8797 // --------------------------------------------------------------
8798 // void user_code_that_offloads(...) {
8799 // omp target depend(..) map(from:a) map(to:b) private(i)
8800 // do i = 1, 10
8801 // a(i) = b(i) + n
8802 // }
8803 //
8804 // --------------------------------------------------------------
8805 //
8806 // we have
8807 //
8808 // --------------------------------------------------------------
8809 //
8810 // void user_code_that_offloads(...) {
8811 // %.offload_baseptrs = alloca [2 x ptr], align 8
8812 // %.offload_ptrs = alloca [2 x ptr], align 8
8813 // %.offload_mappers = alloca [2 x ptr], align 8
8814 // ;; target region has been outlined and now we need to
8815 // ;; offload to it via a target task.
8816 // }
8817 // void outlined_device_function(ptr a, ptr b, ptr n) {
8818 // n = *n_ptr;
8819 // do i = 1, 10
8820 // a(i) = b(i) + n
8821 // }
8822 //
8823 // We have to now do the following
8824 // (i) Make an offloading call to outlined_device_function using the OpenMP
8825 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8826 // emitted by emitKernelLaunch
8827 // (ii) Create a task entry point function that calls kernel_launch_function
8828 // and is the entry point for the target task. See
8829 // '@.omp_target_task_proxy_func in the pseudocode below.
8830 // (iii) Create a task with the task entry point created in (ii)
8831 //
8832 // That is we create the following
8833 // struct task_with_privates {
8834 // struct kmp_task_ompbuilder_t task_struct;
8835 // struct privates {
8836 // [2 x ptr] ; baseptrs
8837 // [2 x ptr] ; ptrs
8838 // [2 x i64] ; sizes
8839 // }
8840 // }
8841 // void user_code_that_offloads(...) {
8842 // %.offload_baseptrs = alloca [2 x ptr], align 8
8843 // %.offload_ptrs = alloca [2 x ptr], align 8
8844 // %.offload_sizes = alloca [2 x i64], align 8
8845 //
8846 // %structArg = alloca { ptr, ptr, ptr }, align 8
8847 // %strucArg[0] = a
8848 // %strucArg[1] = b
8849 // %strucArg[2] = &n
8850 //
8851 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8852 // sizeof(kmp_task_ompbuilder_t),
8853 // sizeof(structArg),
8854 // @.omp_target_task_proxy_func,
8855 // ...)
8856 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8857 // sizeof(structArg))
8858 // memcpy(target_task_with_privates->privates->baseptrs,
8859 // offload_baseptrs, sizeof(offload_baseptrs)
8860 // memcpy(target_task_with_privates->privates->ptrs,
8861 // offload_ptrs, sizeof(offload_ptrs)
8862 // memcpy(target_task_with_privates->privates->sizes,
8863 // offload_sizes, sizeof(offload_sizes)
8864 // dependencies_array = ...
8865 // ;; if nowait not present
8866 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8867 // call @__kmpc_omp_task_begin_if0(...)
8868 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8869 // %target_task_with_privates)
8870 // call @__kmpc_omp_task_complete_if0(...)
8871 // }
8872 //
8873 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8874 // ptr %task) {
8875 // %structArg = alloca {ptr, ptr, ptr}
8876 // %task_ptr = getelementptr(%task, 0, 0)
8877 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8878 // mempcy(%structArg, %shared_data, sizeof(%structArg))
8879 //
8880 // %offloading_arrays = getelementptr(%task, 0, 1)
8881 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8882 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8883 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8884 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8885 // %offload_sizes, %structArg)
8886 // }
8887 //
8888 // We need the proxy function because the signature of the task entry point
8889 // expected by kmpc_omp_task is always the same and will be different from
8890 // that of the kernel_launch function.
8891 //
8892 // kernel_launch_function is generated by emitKernelLaunch and has the
8893 // always_inline attribute. For this example, it'll look like so:
8894 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8895 // %offload_sizes, %structArg) alwaysinline {
8896 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8897 // ; load aggregated data from %structArg
8898 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8899 // ; offload_sizes
8900 // call i32 @__tgt_target_kernel(...,
8901 // outlined_device_function,
8902 // ptr %kernel_args)
8903 // }
8904 // void outlined_device_function(ptr a, ptr b, ptr n) {
8905 // n = *n_ptr;
8906 // do i = 1, 10
8907 // a(i) = b(i) + n
8908 // }
8909 //
8910 BasicBlock *TargetTaskBodyBB =
8911 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8912 BasicBlock *TargetTaskAllocaBB =
8913 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8914
8915 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8916 TargetTaskAllocaBB->begin());
8917 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8918
8919 OutlineInfo OI;
8920 OI.EntryBB = TargetTaskAllocaBB;
8921 OI.OuterAllocaBB = AllocaIP.getBlock();
8922
8923 // Add the thread ID argument.
8926 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8927
8928 // Generate the task body which will subsequently be outlined.
8929 Builder.restoreIP(TargetTaskBodyIP);
8930 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8931 return Err;
8932
8933 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
8934 // it is given. These blocks are enumerated by
8935 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8936 // to be outside the region. In other words, OI.ExitBlock is expected to be
8937 // the start of the region after the outlining. We used to set OI.ExitBlock
8938 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8939 // except when the task body is a single basic block. In that case,
8940 // OI.ExitBlock is set to the single task body block and will get left out of
8941 // the outlining process. So, simply create a new empty block to which we
8942 // uncoditionally branch from where TaskBodyCB left off
8943 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8944 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8945 /*IsFinished=*/true);
8946
8947 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8948 bool NeedsTargetTask = HasNoWait && DeviceID;
8949 if (NeedsTargetTask) {
8950 for (auto *V :
8951 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8952 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8953 RTArgs.SizesArray}) {
8955 OffloadingArraysToPrivatize.push_back(V);
8957 }
8958 }
8959 }
8960 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8961 DeviceID, OffloadingArraysToPrivatize](
8962 Function &OutlinedFn) mutable {
8963 assert(OutlinedFn.hasOneUse() &&
8964 "there must be a single user for the outlined function");
8965
8966 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8967
8968 // The first argument of StaleCI is always the thread id.
8969 // The next few arguments are the pointers to offloading arrays
8970 // if any. (see OffloadingArraysToPrivatize)
8971 // Finally, all other local values that are live-in into the outlined region
8972 // end up in a structure whose pointer is passed as the last argument. This
8973 // piece of data is passed in the "shared" field of the task structure. So,
8974 // we know we have to pass shareds to the task if the number of arguments is
8975 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8976 // thread id. Further, for safety, we assert that the number of arguments of
8977 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8978 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8979 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8980 assert((!HasShareds ||
8981 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8982 "Wrong number of arguments for StaleCI when shareds are present");
8983 int SharedArgOperandNo =
8984 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8985
8986 StructType *TaskWithPrivatesTy =
8987 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8988 StructType *PrivatesTy = nullptr;
8989
8990 if (!OffloadingArraysToPrivatize.empty())
8991 PrivatesTy =
8992 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8993
8995 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8996 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8997
8998 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8999 << "\n");
9000
9001 Builder.SetInsertPoint(StaleCI);
9002
9003 // Gather the arguments for emitting the runtime call.
9004 uint32_t SrcLocStrSize;
9005 Constant *SrcLocStr =
9007 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9008
9009 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9010 //
9011 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9012 // the DeviceID to the deferred task and also since
9013 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9014 Function *TaskAllocFn =
9015 !NeedsTargetTask
9016 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9018 OMPRTL___kmpc_omp_target_task_alloc);
9019
9020 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9021 // call.
9022 Value *ThreadID = getOrCreateThreadID(Ident);
9023
9024 // Argument - `sizeof_kmp_task_t` (TaskSize)
9025 // Tasksize refers to the size in bytes of kmp_task_t data structure
9026 // plus any other data to be passed to the target task, if any, which
9027 // is packed into a struct. kmp_task_t and the struct so created are
9028 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9029 Value *TaskSize = Builder.getInt64(
9030 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9031
9032 // Argument - `sizeof_shareds` (SharedsSize)
9033 // SharedsSize refers to the shareds array size in the kmp_task_t data
9034 // structure.
9035 Value *SharedsSize = Builder.getInt64(0);
9036 if (HasShareds) {
9037 auto *ArgStructAlloca =
9038 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9039 assert(ArgStructAlloca &&
9040 "Unable to find the alloca instruction corresponding to arguments "
9041 "for extracted function");
9042 std::optional<TypeSize> ArgAllocSize =
9043 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9044 assert(ArgAllocSize &&
9045 "Unable to determine size of arguments for extracted function");
9046 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9047 }
9048
9049 // Argument - `flags`
9050 // Task is tied iff (Flags & 1) == 1.
9051 // Task is untied iff (Flags & 1) == 0.
9052 // Task is final iff (Flags & 2) == 2.
9053 // Task is not final iff (Flags & 2) == 0.
9054 // A target task is not final and is untied.
9055 Value *Flags = Builder.getInt32(0);
9056
9057 // Emit the @__kmpc_omp_task_alloc runtime call
9058 // The runtime call returns a pointer to an area where the task captured
9059 // variables must be copied before the task is run (TaskData)
9060 CallInst *TaskData = nullptr;
9061
9062 SmallVector<llvm::Value *> TaskAllocArgs = {
9063 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9064 /*flags=*/Flags,
9065 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9066 /*task_func=*/ProxyFn};
9067
9068 if (NeedsTargetTask) {
9069 assert(DeviceID && "Expected non-empty device ID.");
9070 TaskAllocArgs.push_back(DeviceID);
9071 }
9072
9073 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9074
9075 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9076 if (HasShareds) {
9077 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9079 *this, Builder, TaskData, TaskWithPrivatesTy);
9080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9081 SharedsSize);
9082 }
9083 if (!OffloadingArraysToPrivatize.empty()) {
9084 Value *Privates =
9085 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9086 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9087 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9088 [[maybe_unused]] Type *ArrayType =
9089 getOffloadingArrayType(PtrToPrivatize);
9090 assert(ArrayType && "ArrayType cannot be nullptr");
9091
9092 Type *ElementType = PrivatesTy->getElementType(i);
9093 assert(ElementType == ArrayType &&
9094 "ElementType should match ArrayType");
9095 (void)ArrayType;
9096
9097 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9098 Builder.CreateMemCpy(
9099 Dst, Alignment, PtrToPrivatize, Alignment,
9100 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9101 }
9102 }
9103
9104 Value *DepArray = emitTaskDependencies(*this, Dependencies);
9105
9106 // ---------------------------------------------------------------
9107 // V5.2 13.8 target construct
9108 // If the nowait clause is present, execution of the target task
9109 // may be deferred. If the nowait clause is not present, the target task is
9110 // an included task.
9111 // ---------------------------------------------------------------
9112 // The above means that the lack of a nowait on the target construct
9113 // translates to '#pragma omp task if(0)'
9114 if (!NeedsTargetTask) {
9115 if (DepArray) {
9116 Function *TaskWaitFn =
9117 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9119 TaskWaitFn,
9120 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9121 /*ndeps=*/Builder.getInt32(Dependencies.size()),
9122 /*dep_list=*/DepArray,
9123 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9124 /*noalias_dep_list=*/
9126 }
9127 // Included task.
9128 Function *TaskBeginFn =
9129 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9130 Function *TaskCompleteFn =
9131 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9132 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9133 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9134 CI->setDebugLoc(StaleCI->getDebugLoc());
9135 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9136 } else if (DepArray) {
9137 // HasNoWait - meaning the task may be deferred. Call
9138 // __kmpc_omp_task_with_deps if there are dependencies,
9139 // else call __kmpc_omp_task
9140 Function *TaskFn =
9141 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9143 TaskFn,
9144 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
9145 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
9147 } else {
9148 // Emit the @__kmpc_omp_task runtime call to spawn the task
9149 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9150 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9151 }
9152
9153 StaleCI->eraseFromParent();
9154 for (Instruction *I : llvm::reverse(ToBeDeleted))
9155 I->eraseFromParent();
9156 };
9157 addOutlineInfo(std::move(OI));
9158
9159 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9160 << *(Builder.GetInsertBlock()) << "\n");
9161 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9162 << *(Builder.GetInsertBlock()->getParent()->getParent())
9163 << "\n");
9164 return Builder.saveIP();
9165}
9166
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
    TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
    CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
    bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
  // First materialize the offloading arrays (base pointers, pointers, sizes,
  // map types, ...) described by \p CombinedInfo, then decay them into the
  // argument pointers the offloading runtime expects (see
  // emitOffloadingArraysArgument below), storing the result in \p RTArgs.
  if (Error Err =
          emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
                               CustomMapperCB, IsNonContiguous, DeviceAddrCB))
    return Err;
  emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
  return Error::success();
}
9179
// Emit the host-side launch sequence for a target region: compute kernel
// arguments (teams/threads/trip count/groupprivate memory), then either launch
// the kernel via emitKernelLaunch, wrap the launch in an explicit target task
// (for nowait/depend clauses), or run the host fallback directly.
//
// NOTE(review): several continuation lines of this listing appear truncated
// (extraction artifact) - e.g. parameter lines of the signature and the
// 'cantFail([&]() {' heads of the immediately-invoked lambdas. Verify the
// exact signatures against the upstream source.
static void emitTargetCall(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
    bool HasNoWait, Value *DynCGroupMem,
    OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
  // Generate a function call to the host fallback implementation of the target
  // region. This is called by the host when no offload entry was generated for
  // the target region and when the offloading call fails at runtime.
  auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
    Builder.restoreIP(IP);
    // Ensure the host fallback has the same dyn_ptr ABI as the device.
    SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
    FallbackArgs.push_back(
        Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
    OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
    return Builder.saveIP();
  };

  // A nowait clause or any depend clause forces the launch to be wrapped in an
  // explicit target task (see emitTargetTask calls below).
  bool HasDependencies = Dependencies.size() > 0;
  bool RequiresOuterTargetTask = HasNoWait || HasDependencies;

  // Body of the target task: performs the actual kernel launch (or the host
  // fallback when there is no offload entry / device ID).
  auto TaskBodyCB =
      [&](Value *DeviceID, Value *RTLoc,
          IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
    // Assume no error was returned because EmitTargetCallFallbackCB doesn't
    // produce any.
      // emitKernelLaunch makes the necessary runtime call to offload the
      // kernel. We then outline all that code into a separate function
      // ('kernel_launch_function' in the pseudo code above). This function is
      // then called by the target task proxy function (see
      // '@.omp_target_task_proxy_func' in the pseudo code above)
      // "@.omp_target_task_proxy_func' is generated by
      // emitTargetTaskProxyFunction.
      if (OutlinedFnID && DeviceID)
        return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                           EmitTargetCallFallbackCB, KArgs,
                                           DeviceID, RTLoc, TargetTaskAllocaIP);

      // We only need to do the outlining if `DeviceID` is set to avoid calling
      // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
      // generating the `else` branch of an `if` clause.
      //
      // When OutlinedFnID is set to nullptr, then it's not an offloading call.
      // In this case, we execute the host implementation directly.
      return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
    }());

    OMPBuilder.Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // 'else' path: no kernel launch; run the host fallback, possibly wrapped in
  // a target task when nowait/depend clauses are present.
  auto &&EmitTargetCallElse =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
    // Assume no error was returned because EmitTargetCallFallbackCB doesn't
    // produce any.
      if (RequiresOuterTargetTask) {
        // Arguments that are intended to be directly forwarded to an
        // emitKernelLaunch call are pased as nullptr, since
        // OutlinedFnID=nullptr results in that call not being done.
        return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
                                         /*RTLoc=*/nullptr, AllocaIP,
                                         Dependencies, EmptyRTArgs, HasNoWait);
      }
      return EmitTargetCallFallbackCB(Builder.saveIP());
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // 'then' path: set up the offloading arrays and kernel arguments, then
  // launch the kernel (directly or via a target task).
  auto &&EmitTargetCallThen =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
    Info.HasNoWait = HasNoWait;
    OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());

    if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
            AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
            /*IsNonContiguous=*/true,
            /*ForEndCall=*/false))
      return Err;

    // Per-dimension teams count: runtime value if present, else the default.
    SmallVector<Value *, 3> NumTeamsC;
    for (auto [DefaultVal, RuntimeVal] :
         zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
      NumTeamsC.push_back(RuntimeVal ? RuntimeVal
                                     : Builder.getInt32(DefaultVal));

    // Calculate number of threads: 0 if no clauses specified, otherwise it is
    // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
    auto InitMaxThreadsClause = [&Builder](Value *Clause) {
      if (Clause)
        Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
                                       /*isSigned=*/false);
      return Clause;
    };
    auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
      if (Clause)
        Result =
            Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
                                          Result, Clause)
                   : Clause;
    };

    // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
    // the NUM_THREADS clause is overriden by THREAD_LIMIT.
    SmallVector<Value *, 3> NumThreadsC;
    Value *MaxThreadsClause =
        RuntimeAttrs.TeamsThreadLimit.size() == 1
            ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
            : nullptr;

    for (auto [TeamsVal, TargetVal] : zip_equal(
             RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
      Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
      Value *NumThreads = InitMaxThreadsClause(TargetVal);

      CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
      CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);

      NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
    }

    unsigned NumTargetItems = Info.NumberOfPtrs;
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
    Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                               llvm::omp::IdentFlag(0), 0);

    // Loop trip count (zero-extended to i64) when offloading a loop-based
    // construct; 0 otherwise.
    Value *TripCount = RuntimeAttrs.LoopTripCount
                           ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
                                                   Builder.getInt64Ty(),
                                                   /*isSigned=*/false)
                           : Builder.getInt64(0);

    // Request zero groupprivate bytes by default.
    if (!DynCGroupMem)
      DynCGroupMem = Builder.getInt32(0);

        NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
        HasNoWait, DynCGroupMemFallback);

    // Assume no error was returned because TaskBodyCB and
    // EmitTargetCallFallbackCB don't produce any.
      // The presence of certain clauses on the target directive require the
      // explicit generation of the target task.
      if (RequiresOuterTargetTask)
        return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
                                         RTLoc, AllocaIP, Dependencies,
                                         KArgs.RTArgs, Info.HasNoWait);

      return OMPBuilder.emitKernelLaunch(
          Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
          RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // If we don't have an ID for the target region, it means an offload entry
  // wasn't created. In this case we just run the host fallback directly and
  // ignore any potential 'if' clauses.
  if (!OutlinedFnID) {
    cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
    return;
  }

  // If there's no 'if' clause, only generate the kernel launch code path.
  if (!IfCond) {
    cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
    return;
  }

  cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
                                   EmitTargetCallElse, AllocaIP));
}
9375
    const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, TargetDataInfo &Info,
    TargetRegionEntryInfo &EntryInfo,
    const TargetKernelDefaultAttrs &DefaultAttrs,
    const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
    SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
    CustomMapperCallbackTy CustomMapperCB,
    const SmallVector<DependData> &Dependencies, bool HasNowait,
    Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {

  // Fail (by returning an uninitialized insert point) if the debug location
  // cannot be applied.
  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);

  Function *OutlinedFn;
  Constant *OutlinedFnID = nullptr;
  // The target region is outlined into its own function. The LLVM IR for
  // the target region itself is generated using the callbacks CBFunc
  // and ArgAccessorFuncCB
  // NOTE(review): the head of the outlining call appears truncated in this
  // listing; the next two lines are its trailing arguments - verify against
  // the upstream source.
      *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
      OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
    return Err;

  // If we are not on the target device, then we need to generate code
  // to make a remote call (offload) to the previously outlined function
  // that represents the target region. Do that now.
  if (!Config.isTargetDevice())
    emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
                   IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
                   CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
                   DynCGroupMemFallback);
  return Builder.saveIP();
}
9414
9415std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9416 StringRef FirstSeparator,
9417 StringRef Separator) {
9418 SmallString<128> Buffer;
9419 llvm::raw_svector_ostream OS(Buffer);
9420 StringRef Sep = FirstSeparator;
9421 for (StringRef Part : Parts) {
9422 OS << Sep << Part;
9423 Sep = Separator;
9424 }
9425 return OS.str().str();
9426}
9427
// Build an internal name from \p Parts using the configuration's
// platform-specific first separator and separator strings.
// NOTE(review): the line carrying this function's name and parameter list
// appears truncated in this listing - verify against the upstream source.
std::string
  return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
                                                Config.separator());
}
9433
    Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
  // Cache lookup: InternalVars maps the name to a previously created global.
  auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
  if (Elem.second) {
    // Reuse the cached global; the requested type must match.
    assert(Elem.second->getValueType() == Ty &&
           "OMP internal variable has different type than requested");
  } else {
    // TODO: investigate the appropriate linkage type used for the global
    // variable for possibly changing that to internal or private, or maybe
    // create different versions of the function for different OMP internal
    // variables.
    const DataLayout &DL = M.getDataLayout();
    // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
    // default global AS is 1.
    // See double-target-call-with-declare-target.f90 and
    // declare-target-vars-in-target-region.f90 libomptarget
    // tests.
    unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
                               : M.getTargetTriple().isAMDGPU()
                                   ? 0
                                   : DL.getDefaultGlobalsAddressSpace();
    // NOTE(review): the ternary's result operands for the wasm32 linkage
    // selection appear truncated in this listing - verify against upstream.
    auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
    auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
                                  Constant::getNullValue(Ty), Elem.first(),
                                  /*InsertBefore=*/nullptr,
                                  GlobalValue::NotThreadLocal, AddressSpaceVal);
    // Align to the larger of the type's ABI alignment and the pointer ABI
    // alignment of the chosen address space.
    const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
    GV->setAlignment(std::max(TypeAlign, PtrAlign));
    Elem.second = GV;
  }

  return Elem.second;
}
9470
9471Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9472 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9473 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9474 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9475}
9476
  // Compute the store size of BasePtr's type as an i64 using the classic
  // null-pointer GEP idiom: GEP index 1 off a null pointer, then ptrtoint.
  LLVMContext &Ctx = Builder.getContext();
  Value *Null =
      Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
  Value *SizeGep =
      Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
  Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
  return SizePtrToInt;
}
9486
                                       std::string VarName) {
  // Emit a private, constant global array holding the map-type flags
  // (\p Mappings) under the name \p VarName; unnamed_addr allows the linker
  // to merge identical arrays.
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
      VarName);
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}
9499
                                          InsertPointTy AllocaIP,
                                          unsigned NumOperands,
                                          struct MapperAllocas &MapperAllocas) {
  if (!updateToLocation(Loc))
    return;

  // Allocate, at AllocaIP, the three arrays the mapper runtime call consumes:
  // NumOperands base pointers, NumOperands pointers and NumOperands i64 sizes.
  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(
      ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
  AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
                                          ".offload_ptrs");
  AllocaInst *ArgSizes = Builder.CreateAlloca(
      ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
  MapperAllocas.ArgsBase = ArgsBase;
  MapperAllocas.Args = Args;
  MapperAllocas.ArgSizes = ArgSizes;
}
9521
                                     Function *MapperFunc, Value *SrcLocInfo,
                                     Value *MaptypesArg, Value *MapnamesArg,
                                     int64_t DeviceID, unsigned NumOperands) {
  if (!updateToLocation(Loc))
    return;

  // Decay the allocated arrays (see createMapperAllocas) to pointers to their
  // first elements and pass them to \p MapperFunc together with the location,
  // device ID, operand count, map types and map names.
  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Value *ArgsBaseGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgsGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgSizesGEP =
      Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  // Last argument (the mappers array) is unused here and passed as null.
  Value *NullPtr =
      Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
  createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
                                         Builder.getInt32(NumOperands),
                                         ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
                                         MaptypesArg, MapnamesArg, NullPtr});
}
9548
                                                   TargetDataRTArgs &RTArgs,
                                                   TargetDataInfo &Info,
                                                   bool ForEndCall) {
  // Populate \p RTArgs with pointers to the first elements of the offloading
  // arrays recorded in \p Info.RTArgs (or nulls when there are no pointers).
  assert((!ForEndCall || Info.separateBeginEndCalls()) &&
         "expected region end call to runtime only when end call is separate");
  auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
  auto VoidPtrTy = UnqualPtrTy;
  auto VoidPtrPtrTy = UnqualPtrTy;
  auto Int64Ty = Type::getInt64Ty(M.getContext());
  auto Int64PtrTy = UnqualPtrTy;

  // Nothing mapped: every runtime argument is a null pointer.
  if (!Info.NumberOfPtrs) {
    RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    return;
  }

  // Decay each [N x T] array to &array[0][0].
  RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
      Info.RTArgs.BasePointersArray,
      /*Idx0=*/0, /*Idx1=*/0);
  RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
      /*Idx0=*/0,
      /*Idx1=*/0);
  RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
      /*Idx0=*/0, /*Idx1=*/0);
  // For the region-end call, prefer the dedicated end-of-region map-types
  // array when one was recorded.
  RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs),
      ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
                                                 : Info.RTArgs.MapTypesArray,
      /*Idx0=*/0,
      /*Idx1=*/0);

  // Only emit the mapper information arrays if debug information is
  // requested.
  if (!Info.EmitDebug)
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
        /*Idx0=*/0,
        /*Idx1=*/0);
  // If there is no user-defined mapper, set the mapper array to nullptr to
  // avoid an unnecessary data privatization
  if (!Info.HasMapper)
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MappersArray =
        Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
}
9606
                                                  InsertPointTy CodeGenIP,
                                                  MapInfosTy &CombinedInfo,
                                                  TargetDataInfo &Info) {
      CombinedInfo.NonContigInfo;

  // Build an array of struct descriptor_dim and then assign it to
  // offload_args.
  //
  // struct descriptor_dim {
  //   uint64_t offset;
  //   uint64_t count;
  //   uint64_t stride
  // };
  Type *Int64Ty = Builder.getInt64Ty();
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variable here since the size of "Dims" is the same as
  // the size of Components, however, the size of offset, count, and stride is
  // equal to the size of base declaration that is non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting ir if dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    // Fill the descriptor array dimension-by-dimension, in reverse order
    // (RevIdx walks the dimensions back-to-front).
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      // NOTE(review): alignment below is queried from CountLVal's type;
      // harmless since all descriptor fields are i64, but StrideLVal->getType()
      // was presumably intended.
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    ++L;
  }
}
9673
// Emit the guarded allocation (IsInit) or deletion (!IsInit) block used at the
// entry/exit of a user-defined mapper: when mapping an array section (or, for
// init, when base != begin) and the delete bit matches, push one component
// covering the whole array with TO/FROM cleared so the runtime only
// allocates/deletes memory.
// NOTE(review): two call-head lines of this listing appear truncated
// (the creation of BodyBB and the createRuntimeFunctionCall at the end);
// verify against the upstream source.
void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
    Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
    Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
    BasicBlock *ExitBB, bool IsInit) {
  StringRef Prefix = IsInit ? ".init" : ".del";

  // Evaluate if this is an array section.
      M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
  Value *IsArray =
      Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
  // Extract the OMP_MAP_DELETE bit from the map type.
  Value *DeleteBit = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
  Value *DeleteCond;
  Value *Cond;
  if (IsInit) {
    // base != begin?
    Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
    Cond = Builder.CreateOr(IsArray, BaseIsBegin);
    DeleteCond = Builder.CreateIsNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  } else {
    Cond = IsArray;
    DeleteCond = Builder.CreateIsNotNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  }
  Cond = Builder.CreateAnd(Cond, DeleteCond);
  Builder.CreateCondBr(Cond, BodyBB, ExitBB);

  emitBlock(BodyBB, MapperFn);
  // Get the array size by multiplying element size and element number (i.e., \p
  // Size).
  Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
  // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
  // memory allocation/deletion purpose only.
  Value *MapTypeArg = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_TO |
              OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
  MapTypeArg = Builder.CreateOr(
      MapTypeArg,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));

  // Call the runtime API __tgt_push_mapper_component to fill up the runtime
  // data structure.
  Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
                             ArraySize, MapTypeArg, MapName};
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
      OffloadingArgs);
}
9734
                                   llvm::Value *BeginArg)>
        GenMapInfoCB,
    Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
  // Create the user-defined mapper function \p FuncName with the fixed
  // 6-argument signature the offloading runtime expects:
  //   (handle, base, begin, size-in-bytes, map type, map name) -> void.
  SmallVector<Type *> Params;
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getInt64Ty());
  Params.emplace_back(Builder.getInt64Ty());
  Params.emplace_back(Builder.getPtrTy());

  auto *FnTy =
      FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);

  SmallString<64> TyStr;
  raw_svector_ostream Out(TyStr);
  // NOTE(review): the Function::Create line appears truncated in this listing;
  // verify against the upstream source.
  Function *MapperFn =
  MapperFn->addFnAttr(Attribute::NoInline);
  MapperFn->addFnAttr(Attribute::NoUnwind);
  MapperFn->addParamAttr(0, Attribute::NoUndef);
  MapperFn->addParamAttr(1, Attribute::NoUndef);
  MapperFn->addParamAttr(2, Attribute::NoUndef);
  MapperFn->addParamAttr(3, Attribute::NoUndef);
  MapperFn->addParamAttr(4, Attribute::NoUndef);
  MapperFn->addParamAttr(5, Attribute::NoUndef);

  // Start the mapper function code generation.
  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
  auto SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(EntryBB);

  Value *MapperHandle = MapperFn->getArg(0);
  Value *BaseIn = MapperFn->getArg(1);
  Value *BeginIn = MapperFn->getArg(2);
  Value *Size = MapperFn->getArg(3);
  Value *MapType = MapperFn->getArg(4);
  Value *MapName = MapperFn->getArg(5);

  // Compute the starting and end addresses of array elements.
  // Prepare common arguments for array initiation and deletion.
  // Convert the size in bytes into the number of array elements.
  TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
  Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
  Value *PtrBegin = BeginIn;
  Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);

  // Emit array initiation if this is an array section and \p MapType indicates
  // that memory allocation is required.
  BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, HeadBB,
                             /*IsInit=*/true);

  // Emit a for loop to iterate through SizeArg of elements and map all of them.

  // Emit the loop header block.
  emitBlock(HeadBB, MapperFn);
  BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
  BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
  // Evaluate whether the initial condition is satisfied.
  Value *IsEmpty =
      Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
  Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);

  // Emit the loop body block.
  emitBlock(BodyBB, MapperFn);
  BasicBlock *LastBB = BodyBB;
  PHINode *PtrPHI =
      Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
  PtrPHI->addIncoming(PtrBegin, HeadBB);

  // Get map clause information. Fill up the arrays with all mapped variables.
  MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
  if (!Info)
    return Info.takeError();

  // Call the runtime API __tgt_mapper_num_components to get the number of
  // pre-existing components.
  Value *OffloadingArgs[] = {MapperHandle};
  Value *PreviousSize = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
      OffloadingArgs);
  Value *ShiftedPreviousSize =
      Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));

  // Fill up the runtime mapper handle for all components.
  for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
    Value *CurBaseArg = Info->BasePointers[I];
    Value *CurBeginArg = Info->Pointers[I];
    Value *CurSizeArg = Info->Sizes[I];
    Value *CurNameArg = Info->Names.size()
                            ? Info->Names[I]
                            : Constant::getNullValue(Builder.getPtrTy());

    // Extract the MEMBER_OF field from the map type.
    Value *OriMapType = Builder.getInt64(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            Info->Types[I]));
    Value *MemberMapType =
        Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);

    // Combine the map type inherited from user-defined mapper with that
    // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
    // bits of the \a MapType, which is the input argument of the mapper
    // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
    // bits of MemberMapType.
    // [OpenMP 5.0], 1.2.6. map-type decay.
    //        | alloc |  to   | from  | tofrom | release | delete
    // ----------------------------------------------------------
    // alloc  | alloc | alloc | alloc | alloc  | release | delete
    // to     | alloc | to    | alloc | to     | release | delete
    // from   | alloc | alloc | from  | from   | release | delete
    // tofrom | alloc | to    | from  | tofrom | release | delete
    Value *LeftToFrom = Builder.CreateAnd(
        MapType,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
    BasicBlock *AllocElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
    BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
    BasicBlock *ToElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.to.else");
    BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
    BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
    Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
    Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
    // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
    emitBlock(AllocBB, MapperFn);
    Value *AllocMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(AllocElseBB, MapperFn);
    Value *IsTo = Builder.CreateICmpEQ(
        LeftToFrom,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
    // In case of to, clear OMP_MAP_FROM.
    emitBlock(ToBB, MapperFn);
    Value *ToMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(ToElseBB, MapperFn);
    Value *IsFrom = Builder.CreateICmpEQ(
        LeftToFrom,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateCondBr(IsFrom, FromBB, EndBB);
    // In case of from, clear OMP_MAP_TO.
    emitBlock(FromBB, MapperFn);
    Value *FromMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    // In case of tofrom, do nothing.
    emitBlock(EndBB, MapperFn);
    LastBB = EndBB;
    // Merge the four decayed map types computed above.
    PHINode *CurMapType =
        Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
    CurMapType->addIncoming(AllocMapType, AllocBB);
    CurMapType->addIncoming(ToMapType, ToBB);
    CurMapType->addIncoming(FromMapType, FromBB);
    CurMapType->addIncoming(MemberMapType, ToElseBB);

    Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
                               CurSizeArg,   CurMapType, CurNameArg};

    // Nested user-defined mapper for this component, if any.
    auto ChildMapperFn = CustomMapperCB(I);
    if (!ChildMapperFn)
      return ChildMapperFn.takeError();
    if (*ChildMapperFn) {
      // Call the corresponding mapper function.
      createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
          ->setDoesNotThrow();
    } else {
      // Call the runtime API __tgt_push_mapper_component to fill up the runtime
      // data structure.
      // NOTE(review): the call head on the next lines appears truncated in
      // this listing; verify against the upstream source.
          getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
          OffloadingArgs);
    }
  }

  // Update the pointer to point to the next element that needs to be mapped,
  // and check whether we have mapped all elements.
  Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
                                              "omp.arraymap.next");
  PtrPHI->addIncoming(PtrNext, LastBB);
  Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
  BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
  Builder.CreateCondBr(IsDone, ExitBB, BodyBB);

  emitBlock(ExitBB, MapperFn);
  // Emit array deletion if this is an array section and \p MapType indicates
  // that deletion is required.
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, DoneBB,
                             /*IsInit=*/false);

  // Emit the function exit block.
  emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);

  Builder.CreateRetVoid();
  Builder.restoreIP(SavedIP);
  return MapperFn;
}
9957
9959 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9960 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9961 bool IsNonContiguous,
9962 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9963
9964 // Reset the array information.
9965 Info.clearArrayInfo();
9966 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9967
9968 if (Info.NumberOfPtrs == 0)
9969 return Error::success();
9970
9971 Builder.restoreIP(AllocaIP);
9972 // Detect if we have any capture size requiring runtime evaluation of the
9973 // size so that a constant array could be eventually used.
9974 ArrayType *PointerArrayType =
9975 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9976
9977 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9978 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9979
9980 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9981 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9982 AllocaInst *MappersArray = Builder.CreateAlloca(
9983 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9984 Info.RTArgs.MappersArray = MappersArray;
9985
9986 // If we don't have any VLA types or other types that require runtime
9987 // evaluation, we can use a constant array for the map sizes, otherwise we
9988 // need to fill up the arrays as we do for the pointers.
9989 Type *Int64Ty = Builder.getInt64Ty();
9990 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9991 ConstantInt::get(Int64Ty, 0));
9992 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9993 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9994 bool IsNonContigEntry =
9995 IsNonContiguous &&
9996 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9997 CombinedInfo.Types[I] &
9998 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
9999 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10000 // descriptor_dim records), not the byte size.
10001 if (IsNonContigEntry) {
10002 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10003 "Index must be in-bounds for NON_CONTIG Dims array");
10004 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10005 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10006 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10007 continue;
10008 }
10009 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10010 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10011 ConstSizes[I] = CI;
10012 continue;
10013 }
10014 }
10015 RuntimeSizes.set(I);
10016 }
10017
10018 if (RuntimeSizes.all()) {
10019 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10020 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10021 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10022 restoreIPandDebugLoc(Builder, CodeGenIP);
10023 } else {
10024 auto *SizesArrayInit = ConstantArray::get(
10025 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10026 std::string Name = createPlatformSpecificName({"offload_sizes"});
10027 auto *SizesArrayGbl =
10028 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10029 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10030 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10031
10032 if (!RuntimeSizes.any()) {
10033 Info.RTArgs.SizesArray = SizesArrayGbl;
10034 } else {
10035 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10036 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10037 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10038 AllocaInst *Buffer = Builder.CreateAlloca(
10039 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10040 Buffer->setAlignment(OffloadSizeAlign);
10041 restoreIPandDebugLoc(Builder, CodeGenIP);
10042 Builder.CreateMemCpy(
10043 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10044 SizesArrayGbl, OffloadSizeAlign,
10045 Builder.getIntN(
10046 IndexSize,
10047 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10048
10049 Info.RTArgs.SizesArray = Buffer;
10050 }
10051 restoreIPandDebugLoc(Builder, CodeGenIP);
10052 }
10053
10054 // The map types are always constant so we don't need to generate code to
10055 // fill arrays. Instead, we create an array constant.
10057 for (auto mapFlag : CombinedInfo.Types)
10058 Mapping.push_back(
10059 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10060 mapFlag));
10061 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10062 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10063 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10064
10065 // The information types are only built if provided.
10066 if (!CombinedInfo.Names.empty()) {
10067 auto *MapNamesArrayGbl = createOffloadMapnames(
10068 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10069 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10070 Info.EmitDebug = true;
10071 } else {
10072 Info.RTArgs.MapNamesArray =
10074 Info.EmitDebug = false;
10075 }
10076
10077 // If there's a present map type modifier, it must not be applied to the end
10078 // of a region, so generate a separate map type array in that case.
10079 if (Info.separateBeginEndCalls()) {
10080 bool EndMapTypesDiffer = false;
10081 for (uint64_t &Type : Mapping) {
10082 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10083 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10084 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10085 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10086 EndMapTypesDiffer = true;
10087 }
10088 }
10089 if (EndMapTypesDiffer) {
10090 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10091 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10092 }
10093 }
10094
10095 PointerType *PtrTy = Builder.getPtrTy();
10096 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10097 Value *BPVal = CombinedInfo.BasePointers[I];
10098 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10099 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10100 0, I);
10101 Builder.CreateAlignedStore(BPVal, BP,
10102 M.getDataLayout().getPrefTypeAlign(PtrTy));
10103
10104 if (Info.requiresDevicePointerInfo()) {
10105 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10106 CodeGenIP = Builder.saveIP();
10107 Builder.restoreIP(AllocaIP);
10108 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10109 Builder.restoreIP(CodeGenIP);
10110 if (DeviceAddrCB)
10111 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10112 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10113 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10114 if (DeviceAddrCB)
10115 DeviceAddrCB(I, BP);
10116 }
10117 }
10118
10119 Value *PVal = CombinedInfo.Pointers[I];
10120 Value *P = Builder.CreateConstInBoundsGEP2_32(
10121 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10122 I);
10123 // TODO: Check alignment correct.
10124 Builder.CreateAlignedStore(PVal, P,
10125 M.getDataLayout().getPrefTypeAlign(PtrTy));
10126
10127 if (RuntimeSizes.test(I)) {
10128 Value *S = Builder.CreateConstInBoundsGEP2_32(
10129 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10130 /*Idx0=*/0,
10131 /*Idx1=*/I);
10132 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10133 Int64Ty,
10134 /*isSigned=*/true),
10135 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10136 }
10137 // Fill up the mapper array.
10138 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10139 Value *MFunc = ConstantPointerNull::get(PtrTy);
10140
10141 auto CustomMFunc = CustomMapperCB(I);
10142 if (!CustomMFunc)
10143 return CustomMFunc.takeError();
10144 if (*CustomMFunc)
10145 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10146
10147 Value *MAddr = Builder.CreateInBoundsGEP(
10148 PointerArrayType, MappersArray,
10149 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10150 Builder.CreateAlignedStore(
10151 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10152 }
10153
10154 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10155 Info.NumberOfPtrs == 0)
10156 return Error::success();
10157 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10158 return Error::success();
10159}
10160
10162 BasicBlock *CurBB = Builder.GetInsertBlock();
10163
10164 if (!CurBB || CurBB->getTerminator()) {
10165 // If there is no insert point or the previous block is already
10166 // terminated, don't touch it.
10167 } else {
10168 // Otherwise, create a fall-through branch.
10169 Builder.CreateBr(Target);
10170 }
10171
10172 Builder.ClearInsertionPoint();
10173}
10174
10176 bool IsFinished) {
10177 BasicBlock *CurBB = Builder.GetInsertBlock();
10178
10179 // Fall out of the current block (if necessary).
10180 emitBranch(BB);
10181
10182 if (IsFinished && BB->use_empty()) {
10183 BB->eraseFromParent();
10184 return;
10185 }
10186
10187 // Place the block after the current block, if possible, or else at
10188 // the end of the function.
10189 if (CurBB && CurBB->getParent())
10190 CurFn->insert(std::next(CurBB->getIterator()), BB);
10191 else
10192 CurFn->insert(CurFn->end(), BB);
10193 Builder.SetInsertPoint(BB);
10194}
10195
10197 BodyGenCallbackTy ElseGen,
10198 InsertPointTy AllocaIP) {
10199 // If the condition constant folds and can be elided, try to avoid emitting
10200 // the condition and the dead arm of the if/else.
10201 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10202 auto CondConstant = CI->getSExtValue();
10203 if (CondConstant)
10204 return ThenGen(AllocaIP, Builder.saveIP());
10205
10206 return ElseGen(AllocaIP, Builder.saveIP());
10207 }
10208
10209 Function *CurFn = Builder.GetInsertBlock()->getParent();
10210
10211 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10212 // emit the conditional branch.
10213 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10214 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10215 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10216 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10217 // Emit the 'then' code.
10218 emitBlock(ThenBlock, CurFn);
10219 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10220 return Err;
10221 emitBranch(ContBlock);
10222 // Emit the 'else' code if present.
10223 // There is no need to emit line number for unconditional branch.
10224 emitBlock(ElseBlock, CurFn);
10225 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10226 return Err;
10227 // There is no need to emit line number for unconditional branch.
10228 emitBranch(ContBlock);
10229 // Emit the continuation block for code after the if.
10230 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10231 return Error::success();
10232}
10233
10234bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10235 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10238 "Unexpected Atomic Ordering.");
10239
10240 bool Flush = false;
10242
10243 switch (AK) {
10244 case Read:
10247 FlushAO = AtomicOrdering::Acquire;
10248 Flush = true;
10249 }
10250 break;
10251 case Write:
10252 case Compare:
10253 case Update:
10256 FlushAO = AtomicOrdering::Release;
10257 Flush = true;
10258 }
10259 break;
10260 case Capture:
10261 switch (AO) {
10263 FlushAO = AtomicOrdering::Acquire;
10264 Flush = true;
10265 break;
10267 FlushAO = AtomicOrdering::Release;
10268 Flush = true;
10269 break;
10273 Flush = true;
10274 break;
10275 default:
10276 // do nothing - leave silently.
10277 break;
10278 }
10279 }
10280
10281 if (Flush) {
10282 // Currently Flush RT call still doesn't take memory_ordering, so for when
10283 // that happens, this tries to do the resolution of which atomic ordering
10284 // to use with but issue the flush call
10285 // TODO: pass `FlushAO` after memory ordering support is added
10286 (void)FlushAO;
10287 emitFlush(Loc);
10288 }
10289
10290 // for AO == AtomicOrdering::Monotonic and all other case combinations
10291 // do nothing
10292 return Flush;
10293}
10294
10298 AtomicOrdering AO, InsertPointTy AllocaIP) {
10299 if (!updateToLocation(Loc))
10300 return Loc.IP;
10301
10302 assert(X.Var->getType()->isPointerTy() &&
10303 "OMP Atomic expects a pointer to target memory");
10304 Type *XElemTy = X.ElemTy;
10305 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10306 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10307 "OMP atomic read expected a scalar type");
10308
10309 Value *XRead = nullptr;
10310
10311 if (XElemTy->isIntegerTy()) {
10312 LoadInst *XLD =
10313 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10314 XLD->setAtomic(AO);
10315 XRead = cast<Value>(XLD);
10316 } else if (XElemTy->isStructTy()) {
10317 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10318 // target does not support `atomicrmw` of the size of the struct
10319 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10320 OldVal->setAtomic(AO);
10321 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10322 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10323 OpenMPIRBuilder::AtomicInfo atomicInfo(
10324 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10325 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10326 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10327 XRead = AtomicLoadRes.first;
10328 OldVal->eraseFromParent();
10329 } else {
10330 // We need to perform atomic op as integer
10331 IntegerType *IntCastTy =
10332 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10333 LoadInst *XLoad =
10334 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10335 XLoad->setAtomic(AO);
10336 if (XElemTy->isFloatingPointTy()) {
10337 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10338 } else {
10339 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10340 }
10341 }
10342 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10343 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10344 return Builder.saveIP();
10345}
10346
10349 AtomicOpValue &X, Value *Expr,
10350 AtomicOrdering AO, InsertPointTy AllocaIP) {
10351 if (!updateToLocation(Loc))
10352 return Loc.IP;
10353
10354 assert(X.Var->getType()->isPointerTy() &&
10355 "OMP Atomic expects a pointer to target memory");
10356 Type *XElemTy = X.ElemTy;
10357 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10358 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10359 "OMP atomic write expected a scalar type");
10360
10361 if (XElemTy->isIntegerTy()) {
10362 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10363 XSt->setAtomic(AO);
10364 } else if (XElemTy->isStructTy()) {
10365 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10366 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10367 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10368 OpenMPIRBuilder::AtomicInfo atomicInfo(
10369 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10370 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10371 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10372 OldVal->eraseFromParent();
10373 } else {
10374 // We need to bitcast and perform atomic op as integers
10375 IntegerType *IntCastTy =
10376 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10377 Value *ExprCast =
10378 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10379 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10380 XSt->setAtomic(AO);
10381 }
10382
10383 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10384 return Builder.saveIP();
10385}
10386
10389 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10390 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10391 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10392 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10393 if (!updateToLocation(Loc))
10394 return Loc.IP;
10395
10396 LLVM_DEBUG({
10397 Type *XTy = X.Var->getType();
10398 assert(XTy->isPointerTy() &&
10399 "OMP Atomic expects a pointer to target memory");
10400 Type *XElemTy = X.ElemTy;
10401 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10402 XElemTy->isPointerTy()) &&
10403 "OMP atomic update expected a scalar type");
10404 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10405 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10406 "OpenMP atomic does not support LT or GT operations");
10407 });
10408
10409 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10410 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10411 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10412 if (!AtomicResult)
10413 return AtomicResult.takeError();
10414 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10415 return Builder.saveIP();
10416}
10417
10418// FIXME: Duplicating AtomicExpand
10419Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10420 AtomicRMWInst::BinOp RMWOp) {
10421 switch (RMWOp) {
10422 case AtomicRMWInst::Add:
10423 return Builder.CreateAdd(Src1, Src2);
10424 case AtomicRMWInst::Sub:
10425 return Builder.CreateSub(Src1, Src2);
10426 case AtomicRMWInst::And:
10427 return Builder.CreateAnd(Src1, Src2);
10429 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10430 case AtomicRMWInst::Or:
10431 return Builder.CreateOr(Src1, Src2);
10432 case AtomicRMWInst::Xor:
10433 return Builder.CreateXor(Src1, Src2);
10438 case AtomicRMWInst::Max:
10439 case AtomicRMWInst::Min:
10452 llvm_unreachable("Unsupported atomic update operation");
10453 }
10454 llvm_unreachable("Unsupported atomic update operation");
10455}
10456
10457Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10458 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10460 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10461 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10462 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10463 // or a complex datatype.
10464 bool emitRMWOp = false;
10465 switch (RMWOp) {
10466 case AtomicRMWInst::Add:
10467 case AtomicRMWInst::And:
10469 case AtomicRMWInst::Or:
10470 case AtomicRMWInst::Xor:
10472 emitRMWOp = XElemTy;
10473 break;
10474 case AtomicRMWInst::Sub:
10475 emitRMWOp = (IsXBinopExpr && XElemTy);
10476 break;
10477 default:
10478 emitRMWOp = false;
10479 }
10480 emitRMWOp &= XElemTy->isIntegerTy();
10481
10482 std::pair<Value *, Value *> Res;
10483 if (emitRMWOp) {
10484 AtomicRMWInst *RMWInst =
10485 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10486 if (T.isAMDGPU()) {
10487 if (IsIgnoreDenormalMode)
10488 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10489 llvm::MDNode::get(Builder.getContext(), {}));
10490 if (!IsFineGrainedMemory)
10491 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10492 llvm::MDNode::get(Builder.getContext(), {}));
10493 if (!IsRemoteMemory)
10494 RMWInst->setMetadata("amdgpu.no.remote.memory",
10495 llvm::MDNode::get(Builder.getContext(), {}));
10496 }
10497 Res.first = RMWInst;
10498 // not needed except in case of postfix captures. Generate anyway for
10499 // consistency with the else part. Will be removed with any DCE pass.
10500 // AtomicRMWInst::Xchg does not have a coressponding instruction.
10501 if (RMWOp == AtomicRMWInst::Xchg)
10502 Res.second = Res.first;
10503 else
10504 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10505 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10506 XElemTy->isStructTy()) {
10507 LoadInst *OldVal =
10508 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10509 OldVal->setAtomic(AO);
10510 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10511 unsigned LoadSize =
10512 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10513
10514 OpenMPIRBuilder::AtomicInfo atomicInfo(
10515 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10516 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10517 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10518 BasicBlock *CurBB = Builder.GetInsertBlock();
10519 Instruction *CurBBTI = CurBB->getTerminator();
10520 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10521 BasicBlock *ExitBB =
10522 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10523 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10524 X->getName() + ".atomic.cont");
10525 ContBB->getTerminator()->eraseFromParent();
10526 Builder.restoreIP(AllocaIP);
10527 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10528 NewAtomicAddr->setName(X->getName() + "x.new.val");
10529 Builder.SetInsertPoint(ContBB);
10530 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10531 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10532 Value *OldExprVal = PHI;
10533 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10534 if (!CBResult)
10535 return CBResult.takeError();
10536 Value *Upd = *CBResult;
10537 Builder.CreateStore(Upd, NewAtomicAddr);
10540 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10541 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10542 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10543 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10544 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10545 OldVal->eraseFromParent();
10546 Res.first = OldExprVal;
10547 Res.second = Upd;
10548
10549 if (UnreachableInst *ExitTI =
10551 CurBBTI->eraseFromParent();
10552 Builder.SetInsertPoint(ExitBB);
10553 } else {
10554 Builder.SetInsertPoint(ExitTI);
10555 }
10556 } else {
10557 IntegerType *IntCastTy =
10558 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10559 LoadInst *OldVal =
10560 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10561 OldVal->setAtomic(AO);
10562 // CurBB
10563 // | /---\
10564 // ContBB |
10565 // | \---/
10566 // ExitBB
10567 BasicBlock *CurBB = Builder.GetInsertBlock();
10568 Instruction *CurBBTI = CurBB->getTerminator();
10569 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10570 BasicBlock *ExitBB =
10571 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10572 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10573 X->getName() + ".atomic.cont");
10574 ContBB->getTerminator()->eraseFromParent();
10575 Builder.restoreIP(AllocaIP);
10576 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10577 NewAtomicAddr->setName(X->getName() + "x.new.val");
10578 Builder.SetInsertPoint(ContBB);
10579 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10580 PHI->addIncoming(OldVal, CurBB);
10581 bool IsIntTy = XElemTy->isIntegerTy();
10582 Value *OldExprVal = PHI;
10583 if (!IsIntTy) {
10584 if (XElemTy->isFloatingPointTy()) {
10585 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10586 X->getName() + ".atomic.fltCast");
10587 } else {
10588 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10589 X->getName() + ".atomic.ptrCast");
10590 }
10591 }
10592
10593 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10594 if (!CBResult)
10595 return CBResult.takeError();
10596 Value *Upd = *CBResult;
10597 Builder.CreateStore(Upd, NewAtomicAddr);
10598 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10601 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10602 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10603 Result->setVolatile(VolatileX);
10604 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10605 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10606 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10607 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10608
10609 Res.first = OldExprVal;
10610 Res.second = Upd;
10611
10612 // set Insertion point in exit block
10613 if (UnreachableInst *ExitTI =
10615 CurBBTI->eraseFromParent();
10616 Builder.SetInsertPoint(ExitBB);
10617 } else {
10618 Builder.SetInsertPoint(ExitTI);
10619 }
10620 }
10621
10622 return Res;
10623}
10624
10627 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10628 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10629 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10630 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10631 if (!updateToLocation(Loc))
10632 return Loc.IP;
10633
10634 LLVM_DEBUG({
10635 Type *XTy = X.Var->getType();
10636 assert(XTy->isPointerTy() &&
10637 "OMP Atomic expects a pointer to target memory");
10638 Type *XElemTy = X.ElemTy;
10639 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10640 XElemTy->isPointerTy()) &&
10641 "OMP atomic capture expected a scalar type");
10642 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10643 "OpenMP atomic does not support LT or GT operations");
10644 });
10645
10646 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10647 // 'x' is simply atomically rewritten with 'expr'.
10648 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10649 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10650 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10651 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10652 if (!AtomicResult)
10653 return AtomicResult.takeError();
10654 Value *CapturedVal =
10655 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10656 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10657
10658 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10659 return Builder.saveIP();
10660}
10661
10665 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10666 bool IsFailOnly) {
10667
10669 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10670 IsPostfixUpdate, IsFailOnly, Failure);
10671}
10672
10676 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10677 bool IsFailOnly, AtomicOrdering Failure) {
10678
10679 if (!updateToLocation(Loc))
10680 return Loc.IP;
10681
10682 assert(X.Var->getType()->isPointerTy() &&
10683 "OMP atomic expects a pointer to target memory");
10684 // compare capture
10685 if (V.Var) {
10686 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10687 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10688 }
10689
10690 bool IsInteger = E->getType()->isIntegerTy();
10691
10692 if (Op == OMPAtomicCompareOp::EQ) {
10693 AtomicCmpXchgInst *Result = nullptr;
10694 if (!IsInteger) {
10695 IntegerType *IntCastTy =
10696 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10697 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10698 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10699 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10700 AO, Failure);
10701 } else {
10702 Result =
10703 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10704 }
10705
10706 if (V.Var) {
10707 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10708 if (!IsInteger)
10709 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10710 assert(OldValue->getType() == V.ElemTy &&
10711 "OldValue and V must be of same type");
10712 if (IsPostfixUpdate) {
10713 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10714 } else {
10715 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10716 if (IsFailOnly) {
10717 // CurBB----
10718 // | |
10719 // v |
10720 // ContBB |
10721 // | |
10722 // v |
10723 // ExitBB <-
10724 //
10725 // where ContBB only contains the store of old value to 'v'.
10726 BasicBlock *CurBB = Builder.GetInsertBlock();
10727 Instruction *CurBBTI = CurBB->getTerminator();
10728 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10729 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10730 CurBBTI, X.Var->getName() + ".atomic.exit");
10731 BasicBlock *ContBB = CurBB->splitBasicBlock(
10732 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10733 ContBB->getTerminator()->eraseFromParent();
10734 CurBB->getTerminator()->eraseFromParent();
10735
10736 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10737
10738 Builder.SetInsertPoint(ContBB);
10739 Builder.CreateStore(OldValue, V.Var);
10740 Builder.CreateBr(ExitBB);
10741
10742 if (UnreachableInst *ExitTI =
10744 CurBBTI->eraseFromParent();
10745 Builder.SetInsertPoint(ExitBB);
10746 } else {
10747 Builder.SetInsertPoint(ExitTI);
10748 }
10749 } else {
10750 Value *CapturedValue =
10751 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10752 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10753 }
10754 }
10755 }
10756 // The comparison result has to be stored.
10757 if (R.Var) {
10758 assert(R.Var->getType()->isPointerTy() &&
10759 "r.var must be of pointer type");
10760 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10761
10762 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10763 Value *ResultCast = R.IsSigned
10764 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10765 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10766 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10767 }
10768 } else {
10769 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10770 "Op should be either max or min at this point");
10771 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10772
10773 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10774 // Let's take max as example.
10775 // OpenMP form:
10776 // x = x > expr ? expr : x;
10777 // LLVM form:
10778 // *ptr = *ptr > val ? *ptr : val;
10779 // We need to transform to LLVM form.
10780 // x = x <= expr ? x : expr;
10782 if (IsXBinopExpr) {
10783 if (IsInteger) {
10784 if (X.IsSigned)
10785 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10787 else
10788 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10790 } else {
10791 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10793 }
10794 } else {
10795 if (IsInteger) {
10796 if (X.IsSigned)
10797 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10799 else
10800 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10802 } else {
10803 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10805 }
10806 }
10807
10808 AtomicRMWInst *OldValue =
10809 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10810 if (V.Var) {
10811 Value *CapturedValue = nullptr;
10812 if (IsPostfixUpdate) {
10813 CapturedValue = OldValue;
10814 } else {
10815 CmpInst::Predicate Pred;
10816 switch (NewOp) {
10817 case AtomicRMWInst::Max:
10818 Pred = CmpInst::ICMP_SGT;
10819 break;
10821 Pred = CmpInst::ICMP_UGT;
10822 break;
10824 Pred = CmpInst::FCMP_OGT;
10825 break;
10826 case AtomicRMWInst::Min:
10827 Pred = CmpInst::ICMP_SLT;
10828 break;
10830 Pred = CmpInst::ICMP_ULT;
10831 break;
10833 Pred = CmpInst::FCMP_OLT;
10834 break;
10835 default:
10836 llvm_unreachable("unexpected comparison op");
10837 }
10838 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10839 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10840 }
10841 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10842 }
10843 }
10844
10845 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10846
10847 return Builder.saveIP();
10848}
10849
10852 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10853 Value *NumTeamsUpper, Value *ThreadLimit,
10854 Value *IfExpr) {
10855 if (!updateToLocation(Loc))
10856 return InsertPointTy();
10857
10858 uint32_t SrcLocStrSize;
10859 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10860 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10861 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10862
10863 // Outer allocation basicblock is the entry block of the current function.
10864 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10865 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10866 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10867 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10868 }
10869
10870 // The current basic block is split into four basic blocks. After outlining,
10871 // they will be mapped as follows:
10872 // ```
10873 // def current_fn() {
10874 // current_basic_block:
10875 // br label %teams.exit
10876 // teams.exit:
10877 // ; instructions after teams
10878 // }
10879 //
10880 // def outlined_fn() {
10881 // teams.alloca:
10882 // br label %teams.body
10883 // teams.body:
10884 // ; instructions within teams body
10885 // }
10886 // ```
10887 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10888 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10889 BasicBlock *AllocaBB =
10890 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10891
10892 bool SubClausesPresent =
10893 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10894 // Push num_teams
10895 if (!Config.isTargetDevice() && SubClausesPresent) {
10896 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10897 "if lowerbound is non-null, then upperbound must also be non-null "
10898 "for bounds on num_teams");
10899
10900 if (NumTeamsUpper == nullptr)
10901 NumTeamsUpper = Builder.getInt32(0);
10902
10903 if (NumTeamsLower == nullptr)
10904 NumTeamsLower = NumTeamsUpper;
10905
10906 if (IfExpr) {
10907 assert(IfExpr->getType()->isIntegerTy() &&
10908 "argument to if clause must be an integer value");
10909
10910 // upper = ifexpr ? upper : 1
10911 if (IfExpr->getType() != Int1)
10912 IfExpr = Builder.CreateICmpNE(IfExpr,
10913 ConstantInt::get(IfExpr->getType(), 0));
10914 NumTeamsUpper = Builder.CreateSelect(
10915 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10916
10917 // lower = ifexpr ? lower : 1
10918 NumTeamsLower = Builder.CreateSelect(
10919 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10920 }
10921
10922 if (ThreadLimit == nullptr)
10923 ThreadLimit = Builder.getInt32(0);
10924
10925 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
10926 // truncate or sign extend the passed values to match the int32 parameters.
10927 Value *NumTeamsLowerInt32 =
10928 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
10929 Value *NumTeamsUpperInt32 =
10930 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
10931 Value *ThreadLimitInt32 =
10932 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
10933
10934 Value *ThreadNum = getOrCreateThreadID(Ident);
10935
10937 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10938 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
10939 ThreadLimitInt32});
10940 }
10941 // Generate the body of teams.
10942 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10943 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10944 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10945 return Err;
10946
10947 OutlineInfo OI;
10948 OI.EntryBB = AllocaBB;
10949 OI.ExitBB = ExitBB;
10950 OI.OuterAllocaBB = &OuterAllocaBB;
10951
10952 // Insert fake values for global tid and bound tid.
10954 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10956 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10958 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10959
10960 auto HostPostOutlineCB = [this, Ident,
10961 ToBeDeleted](Function &OutlinedFn) mutable {
10962 // The stale call instruction will be replaced with a new call instruction
10963 // for runtime call with the outlined function.
10964
10965 assert(OutlinedFn.hasOneUse() &&
10966 "there must be a single user for the outlined function");
10967 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10968 ToBeDeleted.push_back(StaleCI);
10969
10970 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10971 "Outlined function must have two or three arguments only");
10972
10973 bool HasShared = OutlinedFn.arg_size() == 3;
10974
10975 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10976 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10977 if (HasShared)
10978 OutlinedFn.getArg(2)->setName("data");
10979
10980 // Call to the runtime function for teams in the current function.
10981 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10982 "outlined function.");
10983 Builder.SetInsertPoint(StaleCI);
10984 SmallVector<Value *> Args = {
10985 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10986 if (HasShared)
10987 Args.push_back(StaleCI->getArgOperand(2));
10990 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10991 Args);
10992
10993 for (Instruction *I : llvm::reverse(ToBeDeleted))
10994 I->eraseFromParent();
10995 };
10996
10997 if (!Config.isTargetDevice())
10998 OI.PostOutlineCB = HostPostOutlineCB;
10999
11000 addOutlineInfo(std::move(OI));
11001
11002 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11003
11004 return Builder.saveIP();
11005}
11006
11009 InsertPointTy OuterAllocaIP,
11010 BodyGenCallbackTy BodyGenCB) {
11011 if (!updateToLocation(Loc))
11012 return InsertPointTy();
11013
11014 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
11015
11016 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11017 BasicBlock *BodyBB =
11018 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11019 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11020 }
11021 BasicBlock *ExitBB =
11022 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11023 BasicBlock *BodyBB =
11024 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11025 BasicBlock *AllocaBB =
11026 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11027
11028 // Generate the body of distribute clause
11029 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11030 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11031 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11032 return Err;
11033
11034 // When using target we use different runtime functions which require a
11035 // callback.
11036 if (Config.isTargetDevice()) {
11037 OutlineInfo OI;
11038 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
11039 OI.EntryBB = AllocaBB;
11040 OI.ExitBB = ExitBB;
11041
11042 addOutlineInfo(std::move(OI));
11043 }
11044 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11045
11046 return Builder.saveIP();
11047}
11048
11051 std::string VarName) {
11052 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11054 Names.size()),
11055 Names);
11056 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11057 M, MapNamesArrayInit->getType(),
11058 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11059 VarName);
11060 return MapNamesArrayGlobal;
11061}
11062
// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
  // Address spaces differ per target: data pointers use the configured
  // default target AS, while function pointers use the program AS taken
  // from the module's DataLayout.
  unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
  // The actual type list lives in OMPKinds.def; each macro below expands one
  // entry kind from that file into an assignment of the matching member.
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
  VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
  VarName##Ptr = PointerType::get(Ctx, ProgramAS);
  /* Reuse an identically-named struct if the module already declares one;
     otherwise create it fresh in this context. */
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
  T = StructType::getTypeByName(Ctx, StructName); \
  if (!T) \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
  VarName = T; \
  VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
11085
11088 SmallVectorImpl<BasicBlock *> &BlockVector) {
11090 BlockSet.insert(EntryBB);
11091 BlockSet.insert(ExitBB);
11092
11093 Worklist.push_back(EntryBB);
11094 while (!Worklist.empty()) {
11095 BasicBlock *BB = Worklist.pop_back_val();
11096 BlockVector.push_back(BB);
11097 for (BasicBlock *SuccBB : successors(BB))
11098 if (BlockSet.insert(SuccBB).second)
11099 Worklist.push_back(SuccBB);
11100 }
11101}
11102
11104 uint64_t Size, int32_t Flags,
11106 StringRef Name) {
11107 if (!Config.isGPU()) {
11110 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11111 return;
11112 }
11113 // TODO: Add support for global variables on the device after declare target
11114 // support.
11115 Function *Fn = dyn_cast<Function>(Addr);
11116 if (!Fn)
11117 return;
11118
11119 // Add a function attribute for the kernel.
11120 Fn->addFnAttr("kernel");
11121 if (T.isAMDGCN())
11122 Fn->addFnAttr("uniform-work-group-size");
11123 Fn->addFnAttr(Attribute::MustProgress);
11124}
11125
// We only generate metadata for functions that contain target regions.
11129
11130 // If there are no entries, we don't need to do anything.
11131 if (OffloadInfoManager.empty())
11132 return;
11133
11134 LLVMContext &C = M.getContext();
11137 16>
11138 OrderedEntries(OffloadInfoManager.size());
11139
11140 // Auxiliary methods to create metadata values and strings.
11141 auto &&GetMDInt = [this](unsigned V) {
11142 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11143 };
11144
11145 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11146
11147 // Create the offloading info metadata node.
11148 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11149 auto &&TargetRegionMetadataEmitter =
11150 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11151 const TargetRegionEntryInfo &EntryInfo,
11153 // Generate metadata for target regions. Each entry of this metadata
11154 // contains:
11155 // - Entry 0 -> Kind of this type of metadata (0).
11156 // - Entry 1 -> Device ID of the file where the entry was identified.
11157 // - Entry 2 -> File ID of the file where the entry was identified.
11158 // - Entry 3 -> Mangled name of the function where the entry was
11159 // identified.
11160 // - Entry 4 -> Line in the file where the entry was identified.
11161 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11162 // - Entry 6 -> Order the entry was created.
11163 // The first element of the metadata node is the kind.
11164 Metadata *Ops[] = {
11165 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11166 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11167 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11168 GetMDInt(E.getOrder())};
11169
11170 // Save this entry in the right position of the ordered entries array.
11171 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11172
11173 // Add metadata to the named metadata node.
11174 MD->addOperand(MDNode::get(C, Ops));
11175 };
11176
11177 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11178
11179 // Create function that emits metadata for each device global variable entry;
11180 auto &&DeviceGlobalVarMetadataEmitter =
11181 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11182 StringRef MangledName,
11184 // Generate metadata for global variables. Each entry of this metadata
11185 // contains:
11186 // - Entry 0 -> Kind of this type of metadata (1).
11187 // - Entry 1 -> Mangled name of the variable.
11188 // - Entry 2 -> Declare target kind.
11189 // - Entry 3 -> Order the entry was created.
11190 // The first element of the metadata node is the kind.
11191 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11192 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11193
11194 // Save this entry in the right position of the ordered entries array.
11195 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11196 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11197
11198 // Add metadata to the named metadata node.
11199 MD->addOperand(MDNode::get(C, Ops));
11200 };
11201
11202 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11203 DeviceGlobalVarMetadataEmitter);
11204
11205 for (const auto &E : OrderedEntries) {
11206 assert(E.first && "All ordered entries must exist!");
11207 if (const auto *CE =
11209 E.first)) {
11210 if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
11212 TargetRegionEntryInfo EntryInfo = E.second;
11213 StringRef FnName = EntryInfo.ParentName;
11214 if (!M.getNamedValue(FnName))
11215 continue;
11216 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11217 continue;
11218 }
11219 createOffloadEntry(CE->getID(), CE->getAddress(),
11220 /*Size=*/0, CE->getFlags(),
11222 } else if (const auto *CE = dyn_cast<
11224 E.first)) {
11227 CE->getFlags());
11228 switch (Flags) {
11231 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11232 continue;
11233 if (!CE->getAddress()) {
11234 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11235 continue;
11236 }
      // The variable has no definition - no need to add the entry.
11238 if (CE->getVarSize() == 0)
11239 continue;
11240 break;
11242 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11243 (!Config.isTargetDevice() && CE->getAddress())) &&
11244 "Declaret target link address is set.");
11245 if (Config.isTargetDevice())
11246 continue;
11247 if (!CE->getAddress()) {
11249 continue;
11250 }
11251 break;
11254 if (!CE->getAddress()) {
11255 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11256 continue;
11257 }
11258 break;
11259 default:
11260 break;
11261 }
11262
11263 // Hidden or internal symbols on the device are not externally visible.
11264 // We should not attempt to register them by creating an offloading
11265 // entry. Indirect variables are handled separately on the device.
11266 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11267 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11268 (Flags !=
11270 Flags != OffloadEntriesInfoManager::
11271 OMPTargetGlobalVarEntryIndirectVTable))
11272 continue;
11273
11274 // Indirect globals need to use a special name that doesn't match the name
11275 // of the associated host global.
11277 Flags ==
11279 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11280 Flags, CE->getLinkage(), CE->getVarName());
11281 else
11282 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11283 Flags, CE->getLinkage());
11284
11285 } else {
11286 llvm_unreachable("Unsupported entry kind.");
11287 }
11288 }
11289
11290 // Emit requires directive globals to a special entry so the runtime can
11291 // register them when the device image is loaded.
11292 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11293 // entries should be redesigned to better suit this use-case.
11294 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11298 ".requires", /*Size=*/0,
11300 Config.getRequiresFlags());
11301}
11302
11305 unsigned FileID, unsigned Line, unsigned Count) {
11306 raw_svector_ostream OS(Name);
11307 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11308 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11309 if (Count)
11310 OS << "_" << Count;
11311}
11312
11314 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11315 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11317 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11318 EntryInfo.Line, NewCount);
11319}
11320
11323 vfs::FileSystem &VFS,
11324 StringRef ParentName) {
11325 sys::fs::UniqueID ID(0xdeadf17e, 0);
11326 auto FileIDInfo = CallBack();
11327 uint64_t FileID = 0;
11328 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11329 ID = Status->getUniqueID();
11330 FileID = Status->getUniqueID().getFile();
11331 } else {
    // If the inode ID could not be determined, create a hash value of
    // the current file name and use that as an ID.
11334 FileID = hash_value(std::get<0>(FileIDInfo));
11335 }
11336
11337 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11338 std::get<1>(FileIDInfo));
11339}
11340
11342 unsigned Offset = 0;
11343 for (uint64_t Remain =
11344 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11346 !(Remain & 1); Remain = Remain >> 1)
11347 Offset++;
11348 return Offset;
11349}
11350
11353 // Rotate by getFlagMemberOffset() bits.
11354 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11355 << getFlagMemberOffset());
11356}
11357
11360 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11361 // If the entry is PTR_AND_OBJ but has not been marked with the special
11362 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11363 // marked as MEMBER_OF.
11364 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11366 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11369 return;
11370
11371 // Entries with ATTACH are not members-of anything. They are handled
11372 // separately by the runtime after other maps have been handled.
11373 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11375 return;
11376
11377 // Reset the placeholder value to prepare the flag for the assignment of the
11378 // proper MEMBER_OF value.
11379 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11380 Flags |= MemberOfFlag;
11381}
11382
11386 bool IsDeclaration, bool IsExternallyVisible,
11387 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11388 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11389 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11390 std::function<Constant *()> GlobalInitializer,
11391 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11392 // TODO: convert this to utilise the IRBuilder Config rather than
11393 // a passed down argument.
11394 if (OpenMPSIMD)
11395 return nullptr;
11396
11399 CaptureClause ==
11401 Config.hasRequiresUnifiedSharedMemory())) {
11402 SmallString<64> PtrName;
11403 {
11404 raw_svector_ostream OS(PtrName);
11405 OS << MangledName;
11406 if (!IsExternallyVisible)
11407 OS << format("_%x", EntryInfo.FileID);
11408 OS << "_decl_tgt_ref_ptr";
11409 }
11410
11411 Value *Ptr = M.getNamedValue(PtrName);
11412
11413 if (!Ptr) {
11414 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11415 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11416
11417 auto *GV = cast<GlobalVariable>(Ptr);
11418 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11419
11420 if (!Config.isTargetDevice()) {
11421 if (GlobalInitializer)
11422 GV->setInitializer(GlobalInitializer());
11423 else
11424 GV->setInitializer(GlobalValue);
11425 }
11426
11428 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11429 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11430 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11431 }
11432
11433 return cast<Constant>(Ptr);
11434 }
11435
11436 return nullptr;
11437}
11438
11442 bool IsDeclaration, bool IsExternallyVisible,
11443 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11444 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11445 std::vector<Triple> TargetTriple,
11446 std::function<Constant *()> GlobalInitializer,
11447 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11448 Constant *Addr) {
11450 (TargetTriple.empty() && !Config.isTargetDevice()))
11451 return;
11452
11454 StringRef VarName;
11455 int64_t VarSize;
11457
11459 CaptureClause ==
11461 !Config.hasRequiresUnifiedSharedMemory()) {
11463 VarName = MangledName;
11464 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11465
11466 if (!IsDeclaration)
11467 VarSize = divideCeil(
11468 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11469 else
11470 VarSize = 0;
11471 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11472
11473 // This is a workaround carried over from Clang which prevents undesired
11474 // optimisation of internal variables.
11475 if (Config.isTargetDevice() &&
11476 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11477 // Do not create a "ref-variable" if the original is not also available
11478 // on the host.
11479 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11480 return;
11481
11482 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11483
11484 if (!M.getNamedValue(RefName)) {
11485 Constant *AddrRef =
11486 getOrCreateInternalVariable(Addr->getType(), RefName);
11487 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11488 GvAddrRef->setConstant(true);
11489 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11490 GvAddrRef->setInitializer(Addr);
11491 GeneratedRefs.push_back(GvAddrRef);
11492 }
11493 }
11494 } else {
11497 else
11499
11500 if (Config.isTargetDevice()) {
11501 VarName = (Addr) ? Addr->getName() : "";
11502 Addr = nullptr;
11503 } else {
11505 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11506 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11507 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11508 VarName = (Addr) ? Addr->getName() : "";
11509 }
11510 VarSize = M.getDataLayout().getPointerSize();
11512 }
11513
11514 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11515 Flags, Linkage);
11516}
11517
11518/// Loads all the offload entries information from the host IR
11519/// metadata.
11521 // If we are in target mode, load the metadata from the host IR. This code has
11522 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11523
11524 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11525 if (!MD)
11526 return;
11527
11528 for (MDNode *MN : MD->operands()) {
11529 auto &&GetMDInt = [MN](unsigned Idx) {
11530 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11531 return cast<ConstantInt>(V->getValue())->getZExtValue();
11532 };
11533
11534 auto &&GetMDString = [MN](unsigned Idx) {
11535 auto *V = cast<MDString>(MN->getOperand(Idx));
11536 return V->getString();
11537 };
11538
11539 switch (GetMDInt(0)) {
11540 default:
11541 llvm_unreachable("Unexpected metadata!");
11542 break;
11543 case OffloadEntriesInfoManager::OffloadEntryInfo::
11544 OffloadingEntryInfoTargetRegion: {
11545 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11546 /*DeviceID=*/GetMDInt(1),
11547 /*FileID=*/GetMDInt(2),
11548 /*Line=*/GetMDInt(4),
11549 /*Count=*/GetMDInt(5));
11550 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11551 /*Order=*/GetMDInt(6));
11552 break;
11553 }
11554 case OffloadEntriesInfoManager::OffloadEntryInfo::
11555 OffloadingEntryInfoDeviceGlobalVar:
11556 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11557 /*MangledName=*/GetMDString(1),
11559 /*Flags=*/GetMDInt(2)),
11560 /*Order=*/GetMDInt(3));
11561 break;
11562 }
11563 }
11564}
11565
11567 StringRef HostFilePath) {
11568 if (HostFilePath.empty())
11569 return;
11570
11571 auto Buf = VFS.getBufferForFile(HostFilePath);
11572 if (std::error_code Err = Buf.getError()) {
11573 report_fatal_error(("error opening host file from host file path inside of "
11574 "OpenMPIRBuilder: " +
11575 Err.message())
11576 .c_str());
11577 }
11578
11579 LLVMContext Ctx;
11581 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11582 if (std::error_code Err = M.getError()) {
11584 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11585 .c_str());
11586 }
11587
11588 loadOffloadInfoMetadata(*M.get());
11589}
11590
11593 llvm::StringRef Name) {
11594 Builder.restoreIP(Loc.IP);
11595
11596 BasicBlock *CurBB = Builder.GetInsertBlock();
11597 assert(CurBB &&
11598 "expected a valid insertion block for creating an iterator loop");
11599 Function *F = CurBB->getParent();
11600
11601 InsertPointTy SplitIP = Builder.saveIP();
11602 if (SplitIP.getPoint() == CurBB->end())
11603 if (Instruction *Terminator = CurBB->getTerminator())
11604 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
11605
11606 BasicBlock *ContBB =
11607 splitBB(SplitIP, /*CreateBranch=*/false,
11608 Builder.getCurrentDebugLocation(), "omp.it.cont");
11609
11610 CanonicalLoopInfo *CLI =
11611 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
11612 /*PreInsertBefore=*/ContBB,
11613 /*PostInsertBefore=*/ContBB, Name);
11614
11615 // Enter loop from original block.
11616 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
11617
11618 // Remove the unconditional branch inserted by createLoopSkeleton in the body
11619 if (Instruction *T = CLI->getBody()->getTerminator())
11620 T->eraseFromParent();
11621
11622 InsertPointTy BodyIP = CLI->getBodyIP();
11623 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
11624 return Err;
11625
11626 // Body must either fallthrough to the latch or branch directly to it.
11627 if (Instruction *BodyTerminator = CLI->getBody()->getTerminator()) {
11628 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
11629 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
11631 "iterator bodygen must terminate the canonical body with an "
11632 "unconditional branch to the loop latch",
11634 }
11635 } else {
11636 // Ensure we end the loop body by jumping to the latch.
11637 Builder.SetInsertPoint(CLI->getBody());
11638 Builder.CreateBr(CLI->getLatch());
11639 }
11640
11641 // Link After -> ContBB
11642 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
11643 if (!CLI->getAfter()->getTerminator())
11644 Builder.CreateBr(ContBB);
11645
11646 return InsertPointTy{ContBB, ContBB->begin()};
11647}
11648
11649/// Mangle the parameter part of the vector function name according to
11650/// their OpenMP classification. The mangling function is defined in
11651/// section 4.5 of the AAVFABI(2021Q1).
11652static std::string mangleVectorParameters(
11654 SmallString<256> Buffer;
11655 llvm::raw_svector_ostream Out(Buffer);
11656 for (const auto &ParamAttr : ParamAttrs) {
11657 switch (ParamAttr.Kind) {
11659 Out << 'l';
11660 break;
11662 Out << 'R';
11663 break;
11665 Out << 'U';
11666 break;
11668 Out << 'L';
11669 break;
11671 Out << 'u';
11672 break;
11674 Out << 'v';
11675 break;
11676 }
11677 if (ParamAttr.HasVarStride)
11678 Out << "s" << ParamAttr.StrideOrArg;
11679 else if (ParamAttr.Kind ==
11681 ParamAttr.Kind ==
11683 ParamAttr.Kind ==
11685 ParamAttr.Kind ==
11687 // Don't print the step value if it is not present or if it is
11688 // equal to 1.
11689 if (ParamAttr.StrideOrArg < 0)
11690 Out << 'n' << -ParamAttr.StrideOrArg;
11691 else if (ParamAttr.StrideOrArg != 1)
11692 Out << ParamAttr.StrideOrArg;
11693 }
11694
11695 if (!!ParamAttr.Alignment)
11696 Out << 'a' << ParamAttr.Alignment;
11697 }
11698
11699 return std::string(Out.str());
11700}
11701
11703 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
11705 struct ISADataTy {
11706 char ISA;
11707 unsigned VecRegSize;
11708 };
11709 ISADataTy ISAData[] = {
11710 {'b', 128}, // SSE
11711 {'c', 256}, // AVX
11712 {'d', 256}, // AVX2
11713 {'e', 512}, // AVX512
11714 };
11716 switch (Branch) {
11718 Masked.push_back('N');
11719 Masked.push_back('M');
11720 break;
11722 Masked.push_back('N');
11723 break;
11725 Masked.push_back('M');
11726 break;
11727 }
11728 for (char Mask : Masked) {
11729 for (const ISADataTy &Data : ISAData) {
11731 llvm::raw_svector_ostream Out(Buffer);
11732 Out << "_ZGV" << Data.ISA << Mask;
11733 if (!VLENVal) {
11734 assert(NumElts && "Non-zero simdlen/cdtsize expected");
11735 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
11736 } else {
11737 Out << VLENVal;
11738 }
11739 Out << mangleVectorParameters(ParamAttrs);
11740 Out << '_' << Fn->getName();
11741 Fn->addFnAttr(Out.str());
11742 }
11743 }
11744}
11745
11746// Function used to add the attribute. The parameter `VLEN` is templated to
11747// allow the use of `x` when targeting scalable functions for SVE.
11748template <typename T>
11749static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
11750 char ISA, StringRef ParSeq,
11751 StringRef MangledName, bool OutputBecomesInput,
11752 llvm::Function *Fn) {
11753 SmallString<256> Buffer;
11754 llvm::raw_svector_ostream Out(Buffer);
11755 Out << Prefix << ISA << LMask << VLEN;
11756 if (OutputBecomesInput)
11757 Out << 'v';
11758 Out << ParSeq << '_' << MangledName;
11759 Fn->addFnAttr(Out.str());
11760}
11761
11762// Helper function to generate the Advanced SIMD names depending on the value
11763// of the NDS when simdlen is not present.
11764static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
11765 StringRef Prefix, char ISA,
11766 StringRef ParSeq, StringRef MangledName,
11767 bool OutputBecomesInput,
11768 llvm::Function *Fn) {
11769 switch (NDS) {
11770 case 8:
11771 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11772 OutputBecomesInput, Fn);
11773 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
11774 OutputBecomesInput, Fn);
11775 break;
11776 case 16:
11777 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11778 OutputBecomesInput, Fn);
11779 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11780 OutputBecomesInput, Fn);
11781 break;
11782 case 32:
11783 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11784 OutputBecomesInput, Fn);
11785 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11786 OutputBecomesInput, Fn);
11787 break;
11788 case 64:
11789 case 128:
11790 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11791 OutputBecomesInput, Fn);
11792 break;
11793 default:
11794 llvm_unreachable("Scalar type is too wide.");
11795 }
11796}
11797
11798/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
// NOTE(review): this listing is missing several source lines (11799, 11801,
// 11820, 11826, 11830, 11846, 11852, 11856 are absent from the embedded
// numbering) -- the function's signature line and the case labels of both
// switches over `Branch` were lost in extraction; recover them from the
// original file before editing this block.
11800 llvm::Function *Fn, unsigned UserVLEN,
11802 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
11803 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
11804
11805 // Sort out parameter sequence.
11806 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
11807 StringRef Prefix = "_ZGV";
11808 StringRef MangledName = Fn->getName();
11809
11810 // Generate simdlen from user input (if any).
11811 if (UserVLEN) {
11812 if (ISA == 's') {
11813 // SVE generates only a masked function.
11814 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11815 OutputBecomesInput, Fn);
11816 return;
11817 }
11818
 // Advanced SIMD with an explicit simdlen: emit masked and/or unmasked
 // variants depending on the (elided) branch kind labels.
11819 switch (Branch) {
11821 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11822 OutputBecomesInput, Fn);
11823 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11824 OutputBecomesInput, Fn);
11825 break;
11827 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11828 OutputBecomesInput, Fn);
11829 break;
11831 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11832 OutputBecomesInput, Fn);
11833 break;
11834 }
11835 return;
11836 }
11837
11838 if (ISA == 's') {
11839 // SVE, section 3.4.1, item 1.
11840 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
11841 OutputBecomesInput, Fn);
11842 return;
11843 }
11844
 // No user simdlen: derive vector lengths from the narrowest data size.
11845 switch (Branch) {
11847 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11848 MangledName, OutputBecomesInput, Fn);
11849 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11850 MangledName, OutputBecomesInput, Fn);
11851 break;
11853 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11854 MangledName, OutputBecomesInput, Fn);
11855 break;
11857 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11858 MangledName, OutputBecomesInput, Fn);
11859 break;
11860 }
11861}
11862
11863//===----------------------------------------------------------------------===//
11864// OffloadEntriesInfoManager
11865//===----------------------------------------------------------------------===//
11866
// NOTE(review): the signature line (11867) is missing from this listing --
// presumably the OffloadEntriesInfoManager emptiness predicate; confirm
// against the original file. True iff no target-region and no
// device-global-variable entries are registered.
11868 return OffloadEntriesTargetRegion.empty() &&
11869 OffloadEntriesDeviceGlobalVar.empty();
11870}
11871
11872unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11873 const TargetRegionEntryInfo &EntryInfo) const {
11874 auto It = OffloadEntriesTargetRegionCount.find(
11875 getTargetRegionEntryCountKey(EntryInfo));
11876 if (It == OffloadEntriesTargetRegionCount.end())
11877 return 0;
11878 return It->second;
11879}
11880
11881void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11882 const TargetRegionEntryInfo &EntryInfo) {
11883 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11884 EntryInfo.Count + 1;
11885}
11886
11887/// Initialize target region entry.
// NOTE(review): the signature line (11888) and the trailing flags argument of
// the OffloadEntryInfoTargetRegion constructor (line 11892) are missing from
// this listing; recover them before editing. Creates a placeholder entry
// (null Addr/ID) at the given emission Order and bumps the global counter.
11889 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11890 OffloadEntriesTargetRegion[EntryInfo] =
11891 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11893 ++OffloadingEntriesNum;
11894}
11895
// NOTE(review): lines 11896 (signature), 11898 (flags parameter) and 11916
// (first half of the host-side early-return condition) are missing from this
// listing; recover them from the original file before editing this block.
// Registers a target region entry: on the device it fills in an entry that
// initializeTargetRegionEntryInfo already created; on the host it creates a
// fresh entry. In both paths the per-location count is advanced at the end.
11897 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11899 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11900
11901 // Update the EntryInfo with the next available count for this location.
11902 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11903
11904 // If we are emitting code for a target, the entry is already initialized,
11905 // only has to be registered.
11906 if (OMPBuilder->Config.isTargetDevice()) {
11907 // This could happen if the device compilation is invoked standalone.
11908 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11909 return;
11910 }
11911 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11912 Entry.setAddress(Addr);
11913 Entry.setID(ID);
11914 Entry.setFlags(Flags);
11915 } else {
11917 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11918 return;
11919 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11920 "Target region entry already registered!");
11921 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11922 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11923 ++OffloadingEntriesNum;
11924 }
11925 incrementTargetRegionEntryInfoCount(EntryInfo);
11926}
11927
// NOTE(review): the signature line (11928) is missing from this listing.
// Returns true only when an entry exists for this location (with the count
// filled in below) and -- unless IgnoreAddressId is set -- its address/ID
// have not yet been assigned, i.e. it has not been fully registered.
11929 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11930
11931 // Update the EntryInfo with the next available count for this location.
11932 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11933
11934 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11935 if (It == OffloadEntriesTargetRegion.end()) {
11936 return false;
11937 }
11938 // Fail if this entry is already registered.
11939 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11940 return false;
11941 return true;
11942}
11943
// NOTE(review): the signature line (11944) is missing from this listing.
// Applies \p Action to every registered target region entry.
11945 const OffloadTargetRegionEntryInfoActTy &Action) {
11946 // Scan all target region entries and perform the provided action.
11947 for (const auto &It : OffloadEntriesTargetRegion) {
11948 Action(It.first, It.second);
11949 }
11950}
11951
// NOTE(review): the signature line (11952) is missing from this listing.
// Creates a placeholder device-global-variable entry keyed by name, carrying
// only the emission Order and entry kind, and bumps the global counter.
11953 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11954 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11955 ++OffloadingEntriesNum;
11956}
11957
// NOTE(review): lines 11958 (signature), 11960 (flags/linkage parameters),
// and 11987/11989 (the condition selecting which try_emplace overload runs)
// are missing from this listing; recover them before editing this block.
// Registers a device global variable: on the device it completes an entry
// initializeDeviceGlobalVarEntryInfo created; on the host it creates one.
11959 StringRef VarName, Constant *Addr, int64_t VarSize,
11961 if (OMPBuilder->Config.isTargetDevice()) {
11962 // This could happen if the device compilation is invoked standalone.
11963 if (!hasDeviceGlobalVarEntryInfo(VarName))
11964 return;
11965 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
 // NOTE(review): hasDeviceGlobalVarEntryInfo(VarName) was already checked
 // at line 11963; the re-check below looks redundant.
11966 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11967 if (Entry.getVarSize() == 0) {
11968 Entry.setVarSize(VarSize);
11969 Entry.setLinkage(Linkage);
11970 }
11971 return;
11972 }
11973 Entry.setVarSize(VarSize);
11974 Entry.setLinkage(Linkage);
11975 Entry.setAddress(Addr);
11976 } else {
11977 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11978 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11979 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11980 "Entry not initialized!");
 // An existing host entry only has its size/linkage filled in lazily.
11981 if (Entry.getVarSize() == 0) {
11982 Entry.setVarSize(VarSize);
11983 Entry.setLinkage(Linkage);
11984 }
11985 return;
11986 }
11988 Flags ==
11990 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11991 Addr, VarSize, Flags, Linkage,
11992 VarName.str());
11993 else
11994 OffloadEntriesDeviceGlobalVar.try_emplace(
11995 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11996 ++OffloadingEntriesNum;
11997 }
11998}
11999
// NOTE(review): the signature lines (12000-12001) are missing from this
// listing. Applies \p Action to every registered device global variable.
12002 // Scan all device global variable entries and perform the provided action.
12003 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12004 Action(E.getKey(), E.getValue());
12005}
12006
12007//===----------------------------------------------------------------------===//
12008// CanonicalLoopInfo
12009//===----------------------------------------------------------------------===//
12010
// NOTE(review): the parameter line (12012) is missing from this listing --
// the body appends to a block vector `BBs`; confirm the exact parameter
// type against the original file. Collects the six loop-control blocks
// (preheader, header, cond, latch, exit, after) into BBs.
12011void CanonicalLoopInfo::collectControlBlocks(
12013 // We only count those BBs as control block for which we do not need to
12014 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12015 // flow. For consistency, this also means we do not add the Body block, which
12016 // is just the entry to the body code.
12017 BBs.reserve(BBs.size() + 6);
12018 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12019}
12020
// NOTE(review): the signature line (12021) is missing from this listing.
// The preheader is found as the unique predecessor of the header other than
// the latch; a canonical loop must always have one.
12022 assert(isValid() && "Requires a valid canonical loop");
12023 for (BasicBlock *Pred : predecessors(Header)) {
12024 if (Pred != Latch)
12025 return Pred;
12026 }
12027 llvm_unreachable("Missing preheader");
12028}
12029
12030void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12031 assert(isValid() && "Requires a valid canonical loop");
12032
12033 Instruction *CmpI = &getCond()->front();
12034 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12035 CmpI->setOperand(1, TripCount);
12036
12037#ifndef NDEBUG
12038 assertOK();
12039#endif
12040}
12041
12042void CanonicalLoopInfo::mapIndVar(
12043 llvm::function_ref<Value *(Instruction *)> Updater) {
12044 assert(isValid() && "Requires a valid canonical loop");
12045
12046 Instruction *OldIV = getIndVar();
12047
12048 // Record all uses excluding those introduced by the updater. Uses by the
12049 // CanonicalLoopInfo itself to keep track of the number of iterations are
12050 // excluded.
12051 SmallVector<Use *> ReplacableUses;
12052 for (Use &U : OldIV->uses()) {
12053 auto *User = dyn_cast<Instruction>(U.getUser());
12054 if (!User)
12055 continue;
12056 if (User->getParent() == getCond())
12057 continue;
12058 if (User->getParent() == getLatch())
12059 continue;
12060 ReplacableUses.push_back(&U);
12061 }
12062
12063 // Run the updater that may introduce new uses
12064 Value *NewIV = Updater(OldIV);
12065
12066 // Replace the old uses with the value returned by the updater.
12067 for (Use *U : ReplacableUses)
12068 U->set(NewIV);
12069
12070#ifndef NDEBUG
12071 assertOK();
12072#endif
12073}
12074
// NOTE(review): the function's signature line (12075) is missing from this
// listing; callers in this file invoke it as `assertOK()` on a
// CanonicalLoopInfo. The whole body is compiled out in release builds.
12076#ifndef NDEBUG
12077 // No constraints if this object currently does not describe a loop.
12078 if (!isValid())
12079 return;
12080
12081 BasicBlock *Preheader = getPreheader();
12082 BasicBlock *Body = getBody();
12083 BasicBlock *After = getAfter();
12084
12085 // Verify standard control-flow we use for OpenMP loops.
12086 assert(Preheader);
12087 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12088 "Preheader must terminate with unconditional branch");
12089 assert(Preheader->getSingleSuccessor() == Header &&
12090 "Preheader must jump to header");
12091
12092 assert(Header);
12093 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12094 "Header must terminate with unconditional branch");
12095 assert(Header->getSingleSuccessor() == Cond &&
12096 "Header must jump to exiting block");
12097
12098 assert(Cond);
12099 assert(Cond->getSinglePredecessor() == Header &&
12100 "Exiting block only reachable from header");
12101
12102 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12103 "Exiting block must terminate with conditional branch");
12104 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12105 "Exiting block's first successor jump to the body");
12106 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12107 "Exiting block's second successor must exit the loop");
12108
12109 assert(Body);
12110 assert(Body->getSinglePredecessor() == Cond &&
12111 "Body only reachable from exiting block");
12112 assert(!isa<PHINode>(Body->front()));
12113
12114 assert(Latch);
12115 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12116 "Latch must terminate with unconditional branch");
12117 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12118 // TODO: To support simple redirecting of the end of the body code that has
12119 // multiple; introduce another auxiliary basic block like preheader and after.
12120 assert(Latch->getSinglePredecessor() != nullptr);
12121 assert(!isa<PHINode>(Latch->front()));
12122
12123 assert(Exit);
12124 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12125 "Exit block must terminate with unconditional branch");
12126 assert(Exit->getSingleSuccessor() == After &&
12127 "Exit block must jump to after block");
12128
12129 assert(After);
12130 assert(After->getSinglePredecessor() == Exit &&
12131 "After block only reachable from exit block");
12132 assert(After->empty() || !isa<PHINode>(After->front()));
12133
 // The canonical IV is a header PHI: zero from the preheader, IV+1 from the
 // latch.
12134 Instruction *IndVar = getIndVar();
12135 assert(IndVar && "Canonical induction variable not found?");
12136 assert(isa<IntegerType>(IndVar->getType()) &&
12137 "Induction variable must be an integer");
12138 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12139 "Induction variable must be a PHI in the loop header");
12140 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12141 assert(
12142 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12143 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12144
12145 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12146 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12147 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12148 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12149 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12150 ->isOne());
12151
12152 Value *TripCount = getTripCount();
12153 assert(TripCount && "Loop trip count not found?");
12154 assert(IndVar->getType() == TripCount->getType() &&
12155 "Trip count and induction variable must have the same type");
12156
 // NOTE(review): the predicate asserted is ICMP_ULT (unsigned <), but the
 // message below says "signed" -- the message text looks stale.
12157 auto *CmpI = cast<CmpInst>(&Cond->front());
12158 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12159 "Exit condition must be a signed less-than comparison");
12160 assert(CmpI->getOperand(0) == IndVar &&
12161 "Exit condition must compare the induction variable");
12162 assert(CmpI->getOperand(1) == TripCount &&
12163 "Exit condition must compare with the trip count");
12164#endif
12165}
12166
// NOTE(review): the signature line (12167) is missing from this listing.
// Clears the stored control blocks; presumably isValid() reports false
// afterwards and the other blocks (Preheader/Body/After) are derived via
// getters -- confirm against the class definition.
12168 Header = nullptr;
12169 Cond = nullptr;
12170 Latch = nullptr;
12171 Exit = nullptr;
12172}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:462
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:449
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:465
bool empty() const
Definition BasicBlock.h:471
const Instruction & back() const
Definition BasicBlock.h:474
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:472
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:467
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:647
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:603
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:990
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
brief Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, AffinityData Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp taskloop
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1092
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1154
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1170
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:150
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:166
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
User * user_back()
Definition Value.h:413
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
bool use_empty() const
Definition Value.h:347
user_iterator user_end()
Definition Value.h:411
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:87
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:374
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:302
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
a struct to pack relevant information while generating atomic Ops
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...