LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
65
66#include <cstdint>
67#include <optional>
68
69#define DEBUG_TYPE "openmp-ir-builder"
70
71using namespace llvm;
72using namespace omp;
73
// Hidden command-line override: when enabled, OpenMP runtime calls are
// annotated with optimistic attributes that describe how they behave
// "as-if", which may enable more aggressive optimization. Off by default.
74static cl::opt<bool>
75    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
76                         cl::desc("Use optimistic attributes describing "
77                                  "'as-if' properties of runtime calls."),
78                         cl::init(false));
79
81 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
82 cl::desc("Factor for the unroll threshold to account for code "
83 "simplifications still taking place"),
84 cl::init(1.5));
85
86#ifndef NDEBUG
87/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
88/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
89/// an InsertPoint stores the instruction before something is inserted. For
90/// instance, if both point to the same instruction, two IRBuilders alternating
91/// creating instructions will cause the instructions to be interleaved.
94 if (!IP1.isSet() || !IP2.isSet())
95 return false;
96 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
97}
98
100 // Valid ordered/unordered and base algorithm combinations.
101 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
102 case OMPScheduleType::UnorderedStaticChunked:
103 case OMPScheduleType::UnorderedStatic:
104 case OMPScheduleType::UnorderedDynamicChunked:
105 case OMPScheduleType::UnorderedGuidedChunked:
106 case OMPScheduleType::UnorderedRuntime:
107 case OMPScheduleType::UnorderedAuto:
108 case OMPScheduleType::UnorderedTrapezoidal:
109 case OMPScheduleType::UnorderedGreedy:
110 case OMPScheduleType::UnorderedBalanced:
111 case OMPScheduleType::UnorderedGuidedIterativeChunked:
112 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
113 case OMPScheduleType::UnorderedSteal:
114 case OMPScheduleType::UnorderedStaticBalancedChunked:
115 case OMPScheduleType::UnorderedGuidedSimd:
116 case OMPScheduleType::UnorderedRuntimeSimd:
117 case OMPScheduleType::OrderedStaticChunked:
118 case OMPScheduleType::OrderedStatic:
119 case OMPScheduleType::OrderedDynamicChunked:
120 case OMPScheduleType::OrderedGuidedChunked:
121 case OMPScheduleType::OrderedRuntime:
122 case OMPScheduleType::OrderedAuto:
123 case OMPScheduleType::OrderdTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedStaticChunked:
125 case OMPScheduleType::NomergeUnorderedStatic:
126 case OMPScheduleType::NomergeUnorderedDynamicChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedChunked:
128 case OMPScheduleType::NomergeUnorderedRuntime:
129 case OMPScheduleType::NomergeUnorderedAuto:
130 case OMPScheduleType::NomergeUnorderedTrapezoidal:
131 case OMPScheduleType::NomergeUnorderedGreedy:
132 case OMPScheduleType::NomergeUnorderedBalanced:
133 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
134 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
135 case OMPScheduleType::NomergeUnorderedSteal:
136 case OMPScheduleType::NomergeOrderedStaticChunked:
137 case OMPScheduleType::NomergeOrderedStatic:
138 case OMPScheduleType::NomergeOrderedDynamicChunked:
139 case OMPScheduleType::NomergeOrderedGuidedChunked:
140 case OMPScheduleType::NomergeOrderedRuntime:
141 case OMPScheduleType::NomergeOrderedAuto:
142 case OMPScheduleType::NomergeOrderedTrapezoidal:
143 case OMPScheduleType::OrderedDistributeChunked:
144 case OMPScheduleType::OrderedDistribute:
145 break;
146 default:
147 return false;
148 }
149
150 // Must not set both monotonicity modifiers at the same time.
151 OMPScheduleType MonotonicityFlags =
152 SchedType & OMPScheduleType::MonotonicityMask;
153 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
154 return false;
155
156 return true;
157}
158#endif
159
160/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
161/// debug location to the last instruction in the specified basic block if the
162/// insert point points to the end of the block.
165 Builder.restoreIP(IP);
166 llvm::BasicBlock *BB = Builder.GetInsertBlock();
167 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
168 if (!BB->empty() && I == BB->end())
169 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
170}
171
172static bool hasGridValue(const Triple &T) {
173 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
174}
175
176static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
177 if (T.isAMDGPU()) {
178 StringRef Features =
179 Kernel->getFnAttribute("target-features").getValueAsString();
180 if (Features.count("+wavefrontsize64"))
183 }
184 if (T.isNVPTX())
186 if (T.isSPIRV())
188 llvm_unreachable("No grid value available for this architecture!");
189}
190
191/// Determine which scheduling algorithm to use, determined from schedule clause
192/// arguments.
193static OMPScheduleType
194getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
195 bool HasSimdModifier, bool HasDistScheduleChunks) {
196  // Currently, the default schedule is static.
197 switch (ClauseKind) {
198 case OMP_SCHEDULE_Default:
199 case OMP_SCHEDULE_Static:
200 return HasChunks ? OMPScheduleType::BaseStaticChunked
201 : OMPScheduleType::BaseStatic;
202 case OMP_SCHEDULE_Dynamic:
203 return OMPScheduleType::BaseDynamicChunked;
204 case OMP_SCHEDULE_Guided:
205 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
206 : OMPScheduleType::BaseGuidedChunked;
207 case OMP_SCHEDULE_Auto:
209 case OMP_SCHEDULE_Runtime:
210 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
211 : OMPScheduleType::BaseRuntime;
212 case OMP_SCHEDULE_Distribute:
213 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
214 : OMPScheduleType::BaseDistribute;
215 }
216 llvm_unreachable("unhandled schedule clause argument");
217}
218
219/// Adds ordering modifier flags to schedule type.
220static OMPScheduleType
222 bool HasOrderedClause) {
223 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
224 OMPScheduleType::None &&
225 "Must not have ordering nor monotonicity flags already set");
226
227 OMPScheduleType OrderingModifier = HasOrderedClause
228 ? OMPScheduleType::ModifierOrdered
229 : OMPScheduleType::ModifierUnordered;
230 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
231
232 // Unsupported combinations
233 if (OrderingScheduleType ==
234 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
235 return OMPScheduleType::OrderedGuidedChunked;
236 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
237 OMPScheduleType::ModifierOrdered))
238 return OMPScheduleType::OrderedRuntime;
239
240 return OrderingScheduleType;
241}
242
243/// Adds monotonicity modifier flags to schedule type.
244static OMPScheduleType
246 bool HasSimdModifier, bool HasMonotonic,
247 bool HasNonmonotonic, bool HasOrderedClause) {
248 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
249 OMPScheduleType::None &&
250 "Must not have monotonicity flags already set");
251 assert((!HasMonotonic || !HasNonmonotonic) &&
252 "Monotonic and Nonmonotonic are contradicting each other");
253
254 if (HasMonotonic) {
255 return ScheduleType | OMPScheduleType::ModifierMonotonic;
256 } else if (HasNonmonotonic) {
257 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
258 } else {
259 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
260 // If the static schedule kind is specified or if the ordered clause is
261 // specified, and if the nonmonotonic modifier is not specified, the
262 // effect is as if the monotonic modifier is specified. Otherwise, unless
263 // the monotonic modifier is specified, the effect is as if the
264 // nonmonotonic modifier is specified.
265 OMPScheduleType BaseScheduleType =
266 ScheduleType & ~OMPScheduleType::ModifierMask;
267 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
268 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
269 HasOrderedClause) {
270 // The monotonic is used by default in openmp runtime library, so no need
271 // to set it.
272 return ScheduleType;
273 } else {
274 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
275 }
276 }
277}
278
279/// Determine the schedule type using schedule and ordering clause arguments.
280static OMPScheduleType
281computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
282 bool HasSimdModifier, bool HasMonotonicModifier,
283 bool HasNonmonotonicModifier, bool HasOrderedClause,
284 bool HasDistScheduleChunks) {
286 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
287 OMPScheduleType OrderedSchedule =
288 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
290 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
291 HasNonmonotonicModifier, HasOrderedClause);
292
294 return Result;
295}
296
297/// Make \p Source branch to \p Target.
298///
299/// Handles two situations:
300/// * \p Source already has an unconditional branch.
301/// * \p Source is a degenerate block (no terminator because the BB is
302/// the current head of the IR construction).
304 if (Instruction *Term = Source->getTerminatorOrNull()) {
305 auto *Br = cast<UncondBrInst>(Term);
306 BasicBlock *Succ = Br->getSuccessor();
307 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
308 Br->setSuccessor(Target);
309 return;
310 }
311
312 auto *NewBr = UncondBrInst::Create(Target, Source);
313 NewBr->setDebugLoc(DL);
314}
315
317 bool CreateBranch, DebugLoc DL) {
318 assert(New->getFirstInsertionPt() == New->begin() &&
319 "Target BB must not have PHI nodes");
320
321 // Move instructions to new block.
322 BasicBlock *Old = IP.getBlock();
323 // If the `Old` block is empty then there are no instructions to move. But in
324 // the new debug scheme, it could have trailing debug records which will be
325 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
326 // reasons:
327 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
328 // 2. Even if `New` is not empty, the rationale to move those records to `New`
329 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
330 // assumes that `Old` is optimized out and is going away. This is not the case
331 // here. The `Old` block is still being used e.g. a branch instruction is
332 // added to it later in this function.
333 // So we call `BasicBlock::splice` only when `Old` is not empty.
334 if (!Old->empty())
335 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
336
337 if (CreateBranch) {
338 auto *NewBr = UncondBrInst::Create(New, Old);
339 NewBr->setDebugLoc(DL);
340 }
341}
342
343void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
344 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
345 BasicBlock *Old = Builder.GetInsertBlock();
346
347 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
348 if (CreateBranch)
349 Builder.SetInsertPoint(Old->getTerminator());
350 else
351 Builder.SetInsertPoint(Old);
352
353 // SetInsertPoint also updates the Builder's debug location, but we want to
354 // keep the one the Builder was configured to use.
355 Builder.SetCurrentDebugLocation(DebugLoc);
356}
357
359 DebugLoc DL, llvm::Twine Name) {
360 BasicBlock *Old = IP.getBlock();
362 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
363 Old->getParent(), Old->getNextNode());
364 spliceBB(IP, New, CreateBranch, DL);
365 New->replaceSuccessorsPhiUsesWith(Old, New);
366 return New;
367}
368
369BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
370 llvm::Twine Name) {
371 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
372 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
373 if (CreateBranch)
374 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
375 else
376 Builder.SetInsertPoint(Builder.GetInsertBlock());
377 // SetInsertPoint also updates the Builder's debug location, but we want to
378 // keep the one the Builder was configured to use.
379 Builder.SetCurrentDebugLocation(DebugLoc);
380 return New;
381}
382
383BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
384 llvm::Twine Name) {
385 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
386 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
387 if (CreateBranch)
388 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
389 else
390 Builder.SetInsertPoint(Builder.GetInsertBlock());
391 // SetInsertPoint also updates the Builder's debug location, but we want to
392 // keep the one the Builder was configured to use.
393 Builder.SetCurrentDebugLocation(DebugLoc);
394 return New;
395}
396
398 llvm::Twine Suffix) {
399 BasicBlock *Old = Builder.GetInsertBlock();
400 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
401}
402
403// This function creates a fake integer value and a fake use for the integer
404// value. It returns the fake value created. This is useful in modeling the
405// extra arguments to the outlined functions.
407 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
409 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
410 const Twine &Name = "", bool AsPtr = true,
411 bool Is64Bit = false) {
412 Builder.restoreIP(OuterAllocaIP);
413 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
414 Instruction *FakeVal;
415 AllocaInst *FakeValAddr =
416 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
417 ToBeDeleted.push_back(FakeValAddr);
418
419 if (AsPtr) {
420 FakeVal = FakeValAddr;
421 } else {
422 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
423 ToBeDeleted.push_back(FakeVal);
424 }
425
426 // Generate a fake use of this value
427 Builder.restoreIP(InnerAllocaIP);
428 Instruction *UseFakeVal;
429 if (AsPtr) {
430 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
431 } else {
432 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
433 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
434 }
435 ToBeDeleted.push_back(UseFakeVal);
436 return FakeVal;
437}
438
439//===----------------------------------------------------------------------===//
440// OpenMPIRBuilderConfig
441//===----------------------------------------------------------------------===//
442
443namespace {
445/// Values for bit flags for marking which requires clauses have been used.
/// These flags are OR-ed together into OpenMPIRBuilderConfig::RequiresFlags
/// and queried/cleared individually by the hasRequires*/setHasRequires*
/// accessors.
446enum OpenMPOffloadingRequiresDirFlags {
447  /// flag undefined.
448  OMP_REQ_UNDEFINED = 0x000,
449  /// no requires directive present.
450  OMP_REQ_NONE = 0x001,
451  /// reverse_offload clause.
452  OMP_REQ_REVERSE_OFFLOAD = 0x002,
453  /// unified_address clause.
454  OMP_REQ_UNIFIED_ADDRESS = 0x004,
455  /// unified_shared_memory clause.
456  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
457  /// dynamic_allocators clause.
458  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
459  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
460};
461
462} // anonymous namespace
463
465 : RequiresFlags(OMP_REQ_UNDEFINED) {}
466
469 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
470 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
473 RequiresFlags(OMP_REQ_UNDEFINED) {
474 if (HasRequiresReverseOffload)
475 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
476 if (HasRequiresUnifiedAddress)
477 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
478 if (HasRequiresUnifiedSharedMemory)
479 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
480 if (HasRequiresDynamicAllocators)
481 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
482}
483
485 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
486}
487
489 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
490}
491
493 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
494}
495
497 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
498}
499
501 return hasRequiresFlags() ? RequiresFlags
502 : static_cast<int64_t>(OMP_REQ_NONE);
503}
504
506 if (Value)
507 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
508 else
509 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
510}
511
513 if (Value)
514 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
515 else
516 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
517}
518
520 if (Value)
521 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
522 else
523 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
524}
525
527 if (Value)
528 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
529 else
530 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
531}
532
533//===----------------------------------------------------------------------===//
534// OpenMPIRBuilder
535//===----------------------------------------------------------------------===//
536
539 SmallVector<Value *> &ArgsVector) {
541 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
542 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
543 constexpr size_t MaxDim = 3;
544 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
545
546 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
547
548 Value *DynCGroupMemFallbackFlag =
549 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
550 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
551 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
552
553 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
554
555 Value *NumTeams3D =
556 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
557 Value *NumThreads3D =
558 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
559 for (unsigned I :
560 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
561 NumTeams3D =
562 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
563 for (unsigned I :
564 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
565 NumThreads3D =
566 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
567
568 ArgsVector = {Version,
569 PointerNum,
570 KernelArgs.RTArgs.BasePointersArray,
571 KernelArgs.RTArgs.PointersArray,
572 KernelArgs.RTArgs.SizesArray,
573 KernelArgs.RTArgs.MapTypesArray,
574 KernelArgs.RTArgs.MapNamesArray,
575 KernelArgs.RTArgs.MappersArray,
576 KernelArgs.NumIterations,
577 Flags,
578 NumTeams3D,
579 NumThreads3D,
580 KernelArgs.DynCGroupMem};
581}
582
584 LLVMContext &Ctx = Fn.getContext();
585
586 // Get the function's current attributes.
587 auto Attrs = Fn.getAttributes();
588 auto FnAttrs = Attrs.getFnAttrs();
589 auto RetAttrs = Attrs.getRetAttrs();
591 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
592 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
593
594 // Add AS to FnAS while taking special care with integer extensions.
595 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
596 bool Param = true) -> void {
597 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
598 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
599 if (HasSignExt || HasZeroExt) {
600 assert(AS.getNumAttributes() == 1 &&
601 "Currently not handling extension attr combined with others.");
602 if (Param) {
603 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
604 FnAS = FnAS.addAttribute(Ctx, AK);
605 } else if (auto AK =
606 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
607 FnAS = FnAS.addAttribute(Ctx, AK);
608 } else {
609 FnAS = FnAS.addAttributes(Ctx, AS);
610 }
611 };
612
613#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
614#include "llvm/Frontend/OpenMP/OMPKinds.def"
615
616 // Add attributes to the function declaration.
617 switch (FnID) {
618#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
619 case Enum: \
620 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
621 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
622 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
623 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
624 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
625 break;
626#include "llvm/Frontend/OpenMP/OMPKinds.def"
627 default:
628 // Attributes are optional.
629 break;
630 }
631}
632
635 FunctionType *FnTy = nullptr;
636 Function *Fn = nullptr;
637
638  // Try to find the declaration in the module first.
639 switch (FnID) {
640#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
641 case Enum: \
642 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
643 IsVarArg); \
644 Fn = M.getFunction(Str); \
645 break;
646#include "llvm/Frontend/OpenMP/OMPKinds.def"
647 }
648
649 if (!Fn) {
650 // Create a new declaration if we need one.
651 switch (FnID) {
652#define OMP_RTL(Enum, Str, ...) \
653 case Enum: \
654 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
655 break;
656#include "llvm/Frontend/OpenMP/OMPKinds.def"
657 }
658 Fn->setCallingConv(Config.getRuntimeCC());
659 // Add information if the runtime function takes a callback function
660 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
661 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
662 LLVMContext &Ctx = Fn->getContext();
663 MDBuilder MDB(Ctx);
664 // Annotate the callback behavior of the runtime function:
665 // - The callback callee is argument number 2 (microtask).
666 // - The first two arguments of the callback callee are unknown (-1).
667 // - All variadic arguments to the runtime function are passed to the
668 // callback callee.
669 Fn->addMetadata(
670 LLVMContext::MD_callback,
672 2, {-1, -1}, /* VarArgsArePassed */ true)}));
673 }
674 }
675
676 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
677 << " with type " << *Fn->getFunctionType() << "\n");
678 addAttributes(FnID, *Fn);
679
680 } else {
681 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
682 << " with type " << *Fn->getFunctionType() << "\n");
683 }
684
685 assert(Fn && "Failed to create OpenMP runtime function");
686
687 return {FnTy, Fn};
688}
689
692 if (!FiniBB) {
693 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
695 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
696 Builder.SetInsertPoint(FiniBB);
697 // FiniCB adds the branch to the exit stub.
698 if (Error Err = FiniCB(Builder.saveIP()))
699 return Err;
700 }
701 return FiniBB;
702}
703
705 BasicBlock *OtherFiniBB) {
706 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
707 if (!FiniBB) {
708 FiniBB = OtherFiniBB;
709
710 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
711 if (Error Err = FiniCB(Builder.saveIP()))
712 return Err;
713
714 return Error::success();
715 }
716
717 // Move instructions from FiniBB to the start of OtherFiniBB.
718 auto EndIt = FiniBB->end();
719 if (FiniBB->size() >= 1)
720 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
721 EndIt = Prev;
722 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
723 EndIt);
724
725 FiniBB->replaceAllUsesWith(OtherFiniBB);
726 FiniBB->eraseFromParent();
727 FiniBB = OtherFiniBB;
728 return Error::success();
729}
730
733 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
734 assert(Fn && "Failed to create OpenMP runtime function pointer");
735 return Fn;
736}
737
740 StringRef Name) {
741 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
742 Call->setCallingConv(Config.getRuntimeCC());
743 return Call;
744}
745
// One-time setup for this builder: delegates to initializeTypes(M), which
// presumably caches the LLVM types used when emitting runtime calls for
// module M — see OMPKinds.def. TODO(review): confirm scope of initialization.
746void OpenMPIRBuilder::initialize() { initializeTypes(M); }
747
750 BasicBlock &EntryBlock = Function->getEntryBlock();
751 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
752
753 // Loop over blocks looking for constant allocas, skipping the entry block
754 // as any allocas there are already in the desired location.
755 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
756 Block++) {
757 for (auto Inst = Block->getReverseIterator()->begin();
758 Inst != Block->getReverseIterator()->end();) {
760 Inst++;
762 continue;
763 AllocaInst->moveBeforePreserving(MoveLocInst);
764 } else {
765 Inst++;
766 }
767 }
768 }
769}
770
773
774 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
775 // TODO: For now, we support simple static allocations, we might need to
776 // move non-static ones as well. However, this will need further analysis to
777    // move the length arguments as well.
779 };
780
781 for (llvm::Instruction &Inst : Block)
783 if (ShouldHoistAlloca(*AllocaInst))
784 AllocasToMove.push_back(AllocaInst);
785
786 auto InsertPoint =
787 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
788
789 for (llvm::Instruction *AllocaInst : AllocasToMove)
791}
792
794 PostDominatorTree PostDomTree(*Func);
795 for (llvm::BasicBlock &BB : *Func)
796 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
798}
799
801 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
803 SmallVector<OutlineInfo, 16> DeferredOutlines;
804 for (OutlineInfo &OI : OutlineInfos) {
805 // Skip functions that have not finalized yet; may happen with nested
806 // function generation.
807 if (Fn && OI.getFunction() != Fn) {
808 DeferredOutlines.push_back(OI);
809 continue;
810 }
811
812 ParallelRegionBlockSet.clear();
813 Blocks.clear();
814 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
815
816 Function *OuterFn = OI.getFunction();
817 CodeExtractorAnalysisCache CEAC(*OuterFn);
818 // If we generate code for the target device, we need to allocate
819 // struct for aggregate params in the device default alloca address space.
820 // OpenMP runtime requires that the params of the extracted functions are
821 // passed as zero address space pointers. This flag ensures that
822 // CodeExtractor generates correct code for extracted functions
823 // which are used by OpenMP runtime.
824 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
825 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
826 /* AggregateArgs */ true,
827 /* BlockFrequencyInfo */ nullptr,
828 /* BranchProbabilityInfo */ nullptr,
829 /* AssumptionCache */ nullptr,
830 /* AllowVarArgs */ true,
831 /* AllowAlloca */ true,
832 /* AllocaBlock*/ OI.OuterAllocaBB,
833 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
834
835 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
836 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
837 << " Exit: " << OI.ExitBB->getName() << "\n");
838 assert(Extractor.isEligible() &&
839 "Expected OpenMP outlining to be possible!");
840
841 for (auto *V : OI.ExcludeArgsFromAggregate)
842 Extractor.excludeArgFromAggregate(V);
843
844 Function *OutlinedFn =
845 Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
846
847 // Forward target-cpu, target-features attributes to the outlined function.
848 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
849 if (TargetCpuAttr.isStringAttribute())
850 OutlinedFn->addFnAttr(TargetCpuAttr);
851
852 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
853 if (TargetFeaturesAttr.isStringAttribute())
854 OutlinedFn->addFnAttr(TargetFeaturesAttr);
855
856 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
857 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
858 assert(OutlinedFn->getReturnType()->isVoidTy() &&
859 "OpenMP outlined functions should not return a value!");
860
861    // For compatibility with the clang CG we move the outlined function after the
862 // one with the parallel region.
863 OutlinedFn->removeFromParent();
864 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
865
866 // Remove the artificial entry introduced by the extractor right away, we
867 // made our own entry block after all.
868 {
869 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
870 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
871 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
872 // Move instructions from the to-be-deleted ArtificialEntry to the entry
873 // basic block of the parallel region. CodeExtractor generates
874 // instructions to unwrap the aggregate argument and may sink
875 // allocas/bitcasts for values that are solely used in the outlined region
876 // and do not escape.
877 assert(!ArtificialEntry.empty() &&
878 "Expected instructions to add in the outlined region entry");
879 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
880 End = ArtificialEntry.rend();
881 It != End;) {
882 Instruction &I = *It;
883 It++;
884
885 if (I.isTerminator()) {
886 // Absorb any debug value that terminator may have
887 if (Instruction *TI = OI.EntryBB->getTerminatorOrNull())
888 TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
889 continue;
890 }
891
892 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
893 }
894
895 OI.EntryBB->moveBefore(&ArtificialEntry);
896 ArtificialEntry.eraseFromParent();
897 }
898 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
899 assert(OutlinedFn && OutlinedFn->hasNUses(1));
900
901 // Run a user callback, e.g. to add attributes.
902 if (OI.PostOutlineCB)
903 OI.PostOutlineCB(*OutlinedFn);
904
905 if (OI.FixUpNonEntryAllocas)
907 }
908
909 // Remove work items that have been completed.
910 OutlineInfos = std::move(DeferredOutlines);
911
912 // The createTarget functions embeds user written code into
913 // the target region which may inject allocas which need to
914 // be moved to the entry block of our target or risk malformed
915 // optimisations by later passes, this is only relevant for
916 // the device pass which appears to be a little more delicate
917 // when it comes to optimisations (however, we do not block on
918 // that here, it's up to the inserter to the list to do so).
919  // This notably has to occur after the OutlinedInfo candidates
920 // have been extracted so we have an end product that will not
921 // be implicitly adversely affected by any raises unless
922 // intentionally appended to the list.
923 // NOTE: This only does so for ConstantData, it could be extended
924 // to ConstantExpr's with further effort, however, they should
925 // largely be folded when they get here. Extending it to runtime
926 // defined/read+writeable allocation sizes would be non-trivial
927 // (need to factor in movement of any stores to variables the
928 // allocation size depends on, as well as the usual loads,
929 // otherwise it'll yield the wrong result after movement) and
930 // likely be more suitable as an LLVM optimisation pass.
933
934 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
935 [](EmitMetadataErrorKind Kind,
936 const TargetRegionEntryInfo &EntryInfo) -> void {
937 errs() << "Error of kind: " << Kind
938 << " when emitting offload entries and metadata during "
939 "OMPIRBuilder finalization \n";
940 };
941
942 if (!OffloadInfoManager.empty())
944
945 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
946 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
947 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
948 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
949 }
950
951 IsFinalized = true;
952}
953
954bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
955
957 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
958}
959
961 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
962 auto *GV =
963 new GlobalVariable(M, I32Ty,
964 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
965 ConstantInt::get(I32Ty, Value), Name);
966 GV->setVisibility(GlobalValue::HiddenVisibility);
967
968 return GV;
969}
970
972 if (List.empty())
973 return;
974
975 // Convert List to what ConstantArray needs.
977 UsedArray.resize(List.size());
978 for (unsigned I = 0, E = List.size(); I != E; ++I)
980 cast<Constant>(&*List[I]), Builder.getPtrTy());
981
982 if (UsedArray.empty())
983 return;
984 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
985
986 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
987 ConstantArray::get(ATy, UsedArray), Name);
988
989 GV->setSection("llvm.metadata");
990}
991
994 OMPTgtExecModeFlags Mode) {
995 auto *Int8Ty = Builder.getInt8Ty();
996 auto *GVMode = new GlobalVariable(
997 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
998 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
999 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1000 return GVMode;
1001}
1002
1004 uint32_t SrcLocStrSize,
1005 IdentFlag LocFlags,
1006 unsigned Reserve2Flags) {
1007 // Enable "C-mode".
1008 LocFlags |= OMP_IDENT_FLAG_KMPC;
1009
1010 Constant *&Ident =
1011 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1012 if (!Ident) {
1013 Constant *I32Null = ConstantInt::getNullValue(Int32);
1014 Constant *IdentData[] = {I32Null,
1015 ConstantInt::get(Int32, uint32_t(LocFlags)),
1016 ConstantInt::get(Int32, Reserve2Flags),
1017 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1018
1019 size_t SrcLocStrArgIdx = 4;
1020 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1022 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1023 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1024 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1025 Constant *Initializer =
1026 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1027
1028 // Look for existing encoding of the location + flags, not needed but
1029 // minimizes the difference to the existing solution while we transition.
1030 for (GlobalVariable &GV : M.globals())
1031 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1032 if (GV.getInitializer() == Initializer)
1033 Ident = &GV;
1034
1035 if (!Ident) {
1036 auto *GV = new GlobalVariable(
1037 M, OpenMPIRBuilder::Ident,
1038 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1040 M.getDataLayout().getDefaultGlobalsAddressSpace());
1041 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1042 GV->setAlignment(Align(8));
1043 Ident = GV;
1044 }
1045 }
1046
1047 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1048}
1049
1051 uint32_t &SrcLocStrSize) {
1052 SrcLocStrSize = LocStr.size();
1053 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1054 if (!SrcLocStr) {
1055 Constant *Initializer =
1056 ConstantDataArray::getString(M.getContext(), LocStr);
1057
1058 // Look for existing encoding of the location, not needed but minimizes the
1059 // difference to the existing solution while we transition.
1060 for (GlobalVariable &GV : M.globals())
1061 if (GV.isConstant() && GV.hasInitializer() &&
1062 GV.getInitializer() == Initializer)
1063 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1064
1065 SrcLocStr = Builder.CreateGlobalString(
1066 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1067 &M);
1068 }
1069 return SrcLocStr;
1070}
1071
1073 StringRef FileName,
1074 unsigned Line, unsigned Column,
1075 uint32_t &SrcLocStrSize) {
1076 SmallString<128> Buffer;
1077 Buffer.push_back(';');
1078 Buffer.append(FileName);
1079 Buffer.push_back(';');
1080 Buffer.append(FunctionName);
1081 Buffer.push_back(';');
1082 Buffer.append(std::to_string(Line));
1083 Buffer.push_back(';');
1084 Buffer.append(std::to_string(Column));
1085 Buffer.push_back(';');
1086 Buffer.push_back(';');
1087 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1088}
1089
1090Constant *
1092 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1093 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1094}
1095
1097 uint32_t &SrcLocStrSize,
1098 Function *F) {
1099 DILocation *DIL = DL.get();
1100 if (!DIL)
1101 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1102 StringRef FileName = M.getName();
1103 if (DIFile *DIF = DIL->getFile())
1104 if (std::optional<StringRef> Source = DIF->getSource())
1105 FileName = *Source;
1106 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1107 if (Function.empty() && F)
1108 Function = F->getName();
1109 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1110 DIL->getColumn(), SrcLocStrSize);
1111}
1112
1114 uint32_t &SrcLocStrSize) {
1115 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1116 Loc.IP.getBlock()->getParent());
1117}
1118
1121 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1122 "omp_global_thread_num");
1123}
1124
1127 bool ForceSimpleCall, bool CheckCancelFlag) {
1128 if (!updateToLocation(Loc))
1129 return Loc.IP;
1130
1131 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1132 // __kmpc_barrier(loc, thread_id);
1133
1134 IdentFlag BarrierLocFlags;
1135 switch (Kind) {
1136 case OMPD_for:
1137 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1138 break;
1139 case OMPD_sections:
1140 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1141 break;
1142 case OMPD_single:
1143 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1144 break;
1145 case OMPD_barrier:
1146 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1147 break;
1148 default:
1149 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1150 break;
1151 }
1152
1153 uint32_t SrcLocStrSize;
1154 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1155 Value *Args[] = {
1156 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1157 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1158
1159 // If we are in a cancellable parallel region, barriers are cancellation
1160 // points.
1161 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1162 bool UseCancelBarrier =
1163 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1164
1166 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1167 ? OMPRTL___kmpc_cancel_barrier
1168 : OMPRTL___kmpc_barrier),
1169 Args);
1170
1171 if (UseCancelBarrier && CheckCancelFlag)
1172 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1173 return Err;
1174
1175 return Builder.saveIP();
1176}
1177
1180 Value *IfCondition,
1181 omp::Directive CanceledDirective) {
1182 if (!updateToLocation(Loc))
1183 return Loc.IP;
1184
1185 // LLVM utilities like blocks with terminators.
1186 auto *UI = Builder.CreateUnreachable();
1187
1188 Instruction *ThenTI = UI, *ElseTI = nullptr;
1189 if (IfCondition) {
1190 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1191
1192 // Even if the if condition evaluates to false, this should count as a
1193 // cancellation point
1194 Builder.SetInsertPoint(ElseTI);
1195 auto ElseIP = Builder.saveIP();
1196
1198 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1199 if (!IPOrErr)
1200 return IPOrErr;
1201 }
1202
1203 Builder.SetInsertPoint(ThenTI);
1204
1205 Value *CancelKind = nullptr;
1206 switch (CanceledDirective) {
1207#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1208 case DirectiveEnum: \
1209 CancelKind = Builder.getInt32(Value); \
1210 break;
1211#include "llvm/Frontend/OpenMP/OMPKinds.def"
1212 default:
1213 llvm_unreachable("Unknown cancel kind!");
1214 }
1215
1216 uint32_t SrcLocStrSize;
1217 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1218 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1219 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1221 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1222
1223 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1224 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1225 return Err;
1226
1227 // Update the insertion point and remove the terminator we introduced.
1228 Builder.SetInsertPoint(UI->getParent());
1229 UI->eraseFromParent();
1230
1231 return Builder.saveIP();
1232}
1233
1236 omp::Directive CanceledDirective) {
1237 if (!updateToLocation(Loc))
1238 return Loc.IP;
1239
1240 // LLVM utilities like blocks with terminators.
1241 auto *UI = Builder.CreateUnreachable();
1242 Builder.SetInsertPoint(UI);
1243
1244 Value *CancelKind = nullptr;
1245 switch (CanceledDirective) {
1246#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1247 case DirectiveEnum: \
1248 CancelKind = Builder.getInt32(Value); \
1249 break;
1250#include "llvm/Frontend/OpenMP/OMPKinds.def"
1251 default:
1252 llvm_unreachable("Unknown cancel kind!");
1253 }
1254
1255 uint32_t SrcLocStrSize;
1256 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1258 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1260 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1261
1262 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1263 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1264 return Err;
1265
1266 // Update the insertion point and remove the terminator we introduced.
1267 Builder.SetInsertPoint(UI->getParent());
1268 UI->eraseFromParent();
1269
1270 return Builder.saveIP();
1271}
1272
1274 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1275 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1276 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1277 if (!updateToLocation(Loc))
1278 return Loc.IP;
1279
1280 Builder.restoreIP(AllocaIP);
1281 auto *KernelArgsPtr =
1282 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1284
1285 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1286 llvm::Value *Arg =
1287 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1288 Builder.CreateAlignedStore(
1289 KernelArgs[I], Arg,
1290 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1291 }
1292
1293 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1294 NumThreads, HostPtr, KernelArgsPtr};
1295
1297 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1298 OffloadingArgs);
1299
1300 return Builder.saveIP();
1301}
1302
1304 const LocationDescription &Loc, Value *OutlinedFnID,
1305 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1306 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1307
1308 if (!updateToLocation(Loc))
1309 return Loc.IP;
1310
1311 // On top of the arrays that were filled up, the target offloading call
1312 // takes as arguments the device id as well as the host pointer. The host
1313 // pointer is used by the runtime library to identify the current target
1314 // region, so it only has to be unique and not necessarily point to
1315 // anything. It could be the pointer to the outlined function that
1316 // implements the target region, but we aren't using that so that the
1317 // compiler doesn't need to keep that, and could therefore inline the host
1318 // function if proven worthwhile during optimization.
1319
1320 // From this point on, we need to have an ID of the target region defined.
1321 assert(OutlinedFnID && "Invalid outlined function ID!");
1322 (void)OutlinedFnID;
1323
1324 // Return value of the runtime offloading call.
1325 Value *Return = nullptr;
1326
1327 // Arguments for the target kernel.
1328 SmallVector<Value *> ArgsVector;
1329 getKernelArgsVector(Args, Builder, ArgsVector);
1330
1331 // The target region is an outlined function launched by the runtime
1332 // via calls to __tgt_target_kernel().
1333 //
1334 // Note that on the host and CPU targets, the runtime implementation of
1335 // these calls simply call the outlined function without forking threads.
1336 // The outlined functions themselves have runtime calls to
1337 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1338 // the compiler in emitTeamsCall() and emitParallelCall().
1339 //
1340 // In contrast, on the NVPTX target, the implementation of
1341 // __tgt_target_teams() launches a GPU kernel with the requested number
1342 // of teams and threads so no additional calls to the runtime are required.
1343 // Check the error code and execute the host version if required.
1344 Builder.restoreIP(emitTargetKernel(
1345 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1346 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1347
1348 BasicBlock *OffloadFailedBlock =
1349 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1350 BasicBlock *OffloadContBlock =
1351 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1352 Value *Failed = Builder.CreateIsNotNull(Return);
1353 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1354
1355 auto CurFn = Builder.GetInsertBlock()->getParent();
1356 emitBlock(OffloadFailedBlock, CurFn);
1357 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1358 if (!AfterIP)
1359 return AfterIP.takeError();
1360 Builder.restoreIP(*AfterIP);
1361 emitBranch(OffloadContBlock);
1362 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1363 return Builder.saveIP();
1364}
1365
1367 Value *CancelFlag, omp::Directive CanceledDirective) {
1368 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1369 "Unexpected cancellation!");
1370
1371 // For a cancel barrier we create two new blocks.
1372 BasicBlock *BB = Builder.GetInsertBlock();
1373 BasicBlock *NonCancellationBlock;
1374 if (Builder.GetInsertPoint() == BB->end()) {
1375 // TODO: This branch will not be needed once we moved to the
1376 // OpenMPIRBuilder codegen completely.
1377 NonCancellationBlock = BasicBlock::Create(
1378 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1379 } else {
1380 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1382 Builder.SetInsertPoint(BB);
1383 }
1384 BasicBlock *CancellationBlock = BasicBlock::Create(
1385 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1386
1387 // Jump to them based on the return value.
1388 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1389 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1390 /* TODO weight */ nullptr, nullptr);
1391
1392 // From the cancellation block we finalize all variables and go to the
1393 // post finalization block that is known to the FiniCB callback.
1394 auto &FI = FinalizationStack.back();
1395 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1396 if (!FiniBBOrErr)
1397 return FiniBBOrErr.takeError();
1398 Builder.SetInsertPoint(CancellationBlock);
1399 Builder.CreateBr(*FiniBBOrErr);
1400
1401 // The continuation block is where code generation continues.
1402 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1403 return Error::success();
1404}
1405
1406// Callback used to create OpenMP runtime calls to support
1407// omp parallel clause for the device.
1408// We need to use this callback to replace the call to the OutlinedFn in OuterFn
1409// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
1411 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1412 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1413 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1414 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1415 // Add some known attributes.
1416 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1417 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1418 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1419 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1420 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1421 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1422
1423 assert(OutlinedFn.arg_size() >= 2 &&
1424 "Expected at least tid and bounded tid as arguments");
1425 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1426
1427 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1428 assert(CI && "Expected call instruction to outlined function");
1429 CI->getParent()->setName("omp_parallel");
1430
1431 Builder.SetInsertPoint(CI);
1432 Type *PtrTy = OMPIRBuilder->VoidPtr;
1433 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1434
1435 // Add alloca for kernel args
1436 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1437 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1438 AllocaInst *ArgsAlloca =
1439 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1440 Value *Args = ArgsAlloca;
1441 // Add address space cast if array for storing arguments is not allocated
1442 // in address space 0
1443 if (ArgsAlloca->getAddressSpace())
1444 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1445 Builder.restoreIP(CurrentIP);
1446
1447 // Store captured vars which are used by kmpc_parallel_60
1448 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1449 Value *V = *(CI->arg_begin() + 2 + Idx);
1450 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1451 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1452 Builder.CreateStore(V, StoreAddress);
1453 }
1454
1455 Value *Cond =
1456 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1457 : Builder.getInt32(1);
1458
1459 // Build kmpc_parallel_60 call
1460 Value *Parallel60CallArgs[] = {
1461 /* identifier*/ Ident,
1462 /* global thread num*/ ThreadID,
1463 /* if expression */ Cond,
1464 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1465 /* Proc bind */ Builder.getInt32(-1),
1466 /* outlined function */ &OutlinedFn,
1467 /* wrapper function */ NullPtrValue,
1468      /* arguments of the outlined function*/ Args,
1469 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1470 /* strict for number of threads */ Builder.getInt32(0)};
1471
1472 FunctionCallee RTLFn =
1473 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1474
1475 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1476
1477 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1478 << *Builder.GetInsertBlock()->getParent() << "\n");
1479
1480 // Initialize the local TID stack location with the argument value.
1481 Builder.SetInsertPoint(PrivTID);
1482 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1483 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1484 PrivTIDAddr);
1485
1486 // Remove redundant call to the outlined function.
1487 CI->eraseFromParent();
1488
1489 for (Instruction *I : ToBeDeleted) {
1490 I->eraseFromParent();
1491 }
1492}
1493
1494// Callback used to create OpenMP runtime calls to support
1495// omp parallel clause for the host.
1496// We need to use this callback to replace the call to the OutlinedFn in OuterFn
1497// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1498static void
1500 Function *OuterFn, Value *Ident, Value *IfCondition,
1501 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1502 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1503 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1504 FunctionCallee RTLFn;
1505 if (IfCondition) {
1506 RTLFn =
1507 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1508 } else {
1509 RTLFn =
1510 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1511 }
1512 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1513 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1514 LLVMContext &Ctx = F->getContext();
1515 MDBuilder MDB(Ctx);
1516 // Annotate the callback behavior of the __kmpc_fork_call:
1517 // - The callback callee is argument number 2 (microtask).
1518 // - The first two arguments of the callback callee are unknown (-1).
1519 // - All variadic arguments to the __kmpc_fork_call are passed to the
1520 // callback callee.
1521 F->addMetadata(LLVMContext::MD_callback,
1523 2, {-1, -1},
1524 /* VarArgsArePassed */ true)}));
1525 }
1526 }
1527 // Add some known attributes.
1528 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1529 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1530 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1531
1532 assert(OutlinedFn.arg_size() >= 2 &&
1533 "Expected at least tid and bounded tid as arguments");
1534 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1535
1536 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1537 CI->getParent()->setName("omp_parallel");
1538 Builder.SetInsertPoint(CI);
1539
1540 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1541 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1542 &OutlinedFn};
1543
1544 SmallVector<Value *, 16> RealArgs;
1545 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1546 if (IfCondition) {
1547 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1548 RealArgs.push_back(Cond);
1549 }
1550 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1551
1552 // __kmpc_fork_call_if always expects a void ptr as the last argument
1553 // If there are no arguments, pass a null pointer.
1554 auto PtrTy = OMPIRBuilder->VoidPtr;
1555 if (IfCondition && NumCapturedVars == 0) {
1556 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1557 RealArgs.push_back(NullPtrValue);
1558 }
1559
1560 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1561
1562 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1563 << *Builder.GetInsertBlock()->getParent() << "\n");
1564
1565 // Initialize the local TID stack location with the argument value.
1566 Builder.SetInsertPoint(PrivTID);
1567 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1568 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1569 PrivTIDAddr);
1570
1571 // Remove redundant call to the outlined function.
1572 CI->eraseFromParent();
1573
1574 for (Instruction *I : ToBeDeleted) {
1575 I->eraseFromParent();
1576 }
1577}
1578
1580 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1581 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1582 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1583 omp::ProcBindKind ProcBind, bool IsCancellable) {
1584 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1585
1586 if (!updateToLocation(Loc))
1587 return Loc.IP;
1588
1589 uint32_t SrcLocStrSize;
1590 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1591 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1592 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1593 (ProcBind != OMP_PROC_BIND_default);
1594 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1595 // If we generate code for the target device, we need to allocate
1596 // struct for aggregate params in the device default alloca address space.
1597 // OpenMP runtime requires that the params of the extracted functions are
1598 // passed as zero address space pointers. This flag ensures that extracted
1599 // function arguments are declared in zero address space
1600 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1601
1602 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1603 // only if we compile for host side.
1604 if (NumThreads && !Config.isTargetDevice()) {
1605 Value *Args[] = {
1606 Ident, ThreadID,
1607 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1610 }
1611
1612 if (ProcBind != OMP_PROC_BIND_default) {
1613 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1614 Value *Args[] = {
1615 Ident, ThreadID,
1616 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1618 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1619 }
1620
1621 BasicBlock *InsertBB = Builder.GetInsertBlock();
1622 Function *OuterFn = InsertBB->getParent();
1623
1624 // Save the outer alloca block because the insertion iterator may get
1625 // invalidated and we still need this later.
1626 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1627
1628 // Vector to remember instructions we used only during the modeling but which
1629 // we want to delete at the end.
1631
1632 // Change the location to the outer alloca insertion point to create and
1633 // initialize the allocas we pass into the parallel region.
1634 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1635 Builder.restoreIP(NewOuter);
1636 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1637 AllocaInst *ZeroAddrAlloca =
1638 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1639 Instruction *TIDAddr = TIDAddrAlloca;
1640 Instruction *ZeroAddr = ZeroAddrAlloca;
1641 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1642 // Add additional casts to enforce pointers in zero address space
1643 TIDAddr = new AddrSpaceCastInst(
1644 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1645 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1646 ToBeDeleted.push_back(TIDAddr);
1647 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1648 PointerType ::get(M.getContext(), 0),
1649 "zero.addr.ascast");
1650 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1651 ToBeDeleted.push_back(ZeroAddr);
1652 }
1653
1654 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1655 // associated arguments in the outlined function, so we delete them later.
1656 ToBeDeleted.push_back(TIDAddrAlloca);
1657 ToBeDeleted.push_back(ZeroAddrAlloca);
1658
1659 // Create an artificial insertion point that will also ensure the blocks we
1660 // are about to split are not degenerated.
1661 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1662
1663 BasicBlock *EntryBB = UI->getParent();
1664 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1665 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1666 BasicBlock *PRegPreFiniBB =
1667 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1668 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1669
1670 auto FiniCBWrapper = [&](InsertPointTy IP) {
1671 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1672 // target to the region exit block.
1673 if (IP.getBlock()->end() == IP.getPoint()) {
1675 Builder.restoreIP(IP);
1676 Instruction *I = Builder.CreateBr(PRegExitBB);
1677 IP = InsertPointTy(I->getParent(), I->getIterator());
1678 }
1679 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1680 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1681 "Unexpected insertion point for finalization call!");
1682 return FiniCB(IP);
1683 };
1684
1685 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1686
1687 // Generate the privatization allocas in the block that will become the entry
1688 // of the outlined function.
1689 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1690 InsertPointTy InnerAllocaIP = Builder.saveIP();
1691
1692 AllocaInst *PrivTIDAddr =
1693 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1694 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1695
1696 // Add some fake uses for OpenMP provided arguments.
1697 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1698 Instruction *ZeroAddrUse =
1699 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1700 ToBeDeleted.push_back(ZeroAddrUse);
1701
1702 // EntryBB
1703 // |
1704 // V
1705 // PRegionEntryBB <- Privatization allocas are placed here.
1706 // |
1707 // V
1708  //       PRegionBodyBB         <- BodyGen is invoked here.
1709 // |
1710 // V
1711 // PRegPreFiniBB <- The block we will start finalization from.
1712 // |
1713 // V
1714 // PRegionExitBB <- A common exit to simplify block collection.
1715 //
1716
1717 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1718
1719 // Let the caller create the body.
1720 assert(BodyGenCB && "Expected body generation callback!");
1721 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1722 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1723 return Err;
1724
1725 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1726
1727 OutlineInfo OI;
1728 if (Config.isTargetDevice()) {
1729 // Generate OpenMP target specific runtime call
1730 OI.PostOutlineCB = [=, ToBeDeletedVec =
1731 std::move(ToBeDeleted)](Function &OutlinedFn) {
1732 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1733 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1734 ThreadID, ToBeDeletedVec);
1735 };
1736 OI.FixUpNonEntryAllocas = true;
1737 } else {
1738 // Generate OpenMP host runtime call
1739 OI.PostOutlineCB = [=, ToBeDeletedVec =
1740 std::move(ToBeDeleted)](Function &OutlinedFn) {
1741 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1742 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1743 };
1744 OI.FixUpNonEntryAllocas = true;
1745 }
1746
1747 OI.OuterAllocaBB = OuterAllocaBlock;
1748 OI.EntryBB = PRegEntryBB;
1749 OI.ExitBB = PRegExitBB;
1750
1751 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1753 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1754
1755 CodeExtractorAnalysisCache CEAC(*OuterFn);
1756 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1757 /* AggregateArgs */ false,
1758 /* BlockFrequencyInfo */ nullptr,
1759 /* BranchProbabilityInfo */ nullptr,
1760 /* AssumptionCache */ nullptr,
1761 /* AllowVarArgs */ true,
1762 /* AllowAlloca */ true,
1763 /* AllocationBlock */ OuterAllocaBlock,
1764 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1765
1766 // Find inputs to, outputs from the code region.
1767 BasicBlock *CommonExit = nullptr;
1768 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1769 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1770
1771 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1772 /*CollectGlobalInputs=*/true);
1773
1774 Inputs.remove_if([&](Value *I) {
1776 return GV->getValueType() == OpenMPIRBuilder::Ident;
1777
1778 return false;
1779 });
1780
1781 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1782
1783 FunctionCallee TIDRTLFn =
1784 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1785
1786 auto PrivHelper = [&](Value &V) -> Error {
1787 if (&V == TIDAddr || &V == ZeroAddr) {
1789 return Error::success();
1790 }
1791
1793 for (Use &U : V.uses())
1794 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1795 if (ParallelRegionBlockSet.count(UserI->getParent()))
1796 Uses.insert(&U);
1797
1798 // __kmpc_fork_call expects extra arguments as pointers. If the input
1799 // already has a pointer type, everything is fine. Otherwise, store the
1800 // value onto stack and load it back inside the to-be-outlined region. This
1801 // will ensure only the pointer will be passed to the function.
1802 // FIXME: if there are more than 15 trailing arguments, they must be
1803 // additionally packed in a struct.
1804 Value *Inner = &V;
1805 if (!V.getType()->isPointerTy()) {
1807 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1808
1809 Builder.restoreIP(OuterAllocaIP);
1810 Value *Ptr =
1811 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1812
1813 // Store to stack at end of the block that currently branches to the entry
1814 // block of the to-be-outlined region.
1815 Builder.SetInsertPoint(InsertBB,
1816 InsertBB->getTerminator()->getIterator());
1817 Builder.CreateStore(&V, Ptr);
1818
1819 // Load back next to allocations in the to-be-outlined region.
1820 Builder.restoreIP(InnerAllocaIP);
1821 Inner = Builder.CreateLoad(V.getType(), Ptr);
1822 }
1823
1824 Value *ReplacementValue = nullptr;
1825 CallInst *CI = dyn_cast<CallInst>(&V);
1826 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1827 ReplacementValue = PrivTID;
1828 } else {
1829 InsertPointOrErrorTy AfterIP =
1830 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1831 if (!AfterIP)
1832 return AfterIP.takeError();
1833 Builder.restoreIP(*AfterIP);
1834 InnerAllocaIP = {
1835 InnerAllocaIP.getBlock(),
1836 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1837
1838 assert(ReplacementValue &&
1839 "Expected copy/create callback to set replacement value!");
1840 if (ReplacementValue == &V)
1841 return Error::success();
1842 }
1843
1844 for (Use *UPtr : Uses)
1845 UPtr->set(ReplacementValue);
1846
1847 return Error::success();
1848 };
1849
1850 // Reset the inner alloca insertion as it will be used for loading the values
1851 // wrapped into pointers before passing them into the to-be-outlined region.
1852 // Configure it to insert immediately after the fake use of zero address so
1853 // that they are available in the generated body and so that the
1854 // OpenMP-related values (thread ID and zero address pointers) remain leading
1855 // in the argument list.
1856 InnerAllocaIP = IRBuilder<>::InsertPoint(
1857 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1858
1859 // Reset the outer alloca insertion point to the entry of the relevant block
1860 // in case it was invalidated.
1861 OuterAllocaIP = IRBuilder<>::InsertPoint(
1862 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1863
1864 for (Value *Input : Inputs) {
1865 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1866 if (Error Err = PrivHelper(*Input))
1867 return Err;
1868 }
1869 LLVM_DEBUG({
1870 for (Value *Output : Outputs)
1871 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1872 });
1873 assert(Outputs.empty() &&
1874 "OpenMP outlining should not produce live-out values!");
1875
1876 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1877 LLVM_DEBUG({
1878 for (auto *BB : Blocks)
1879 dbgs() << " PBR: " << BB->getName() << "\n";
1880 });
1881
1882 // Adjust the finalization stack, verify the adjustment, and call the
1883 // finalize function a last time to finalize values between the pre-fini
1884 // block and the exit block if we left the parallel "the normal way".
1885 auto FiniInfo = FinalizationStack.pop_back_val();
1886 (void)FiniInfo;
1887 assert(FiniInfo.DK == OMPD_parallel &&
1888 "Unexpected finalization stack state!");
1889
1890 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1891
1892 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1893 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1894 if (!FiniBBOrErr)
1895 return FiniBBOrErr.takeError();
1896 {
1898 Builder.restoreIP(PreFiniIP);
1899 Builder.CreateBr(*FiniBBOrErr);
1900 // There's currently a branch to omp.par.exit. Delete it. We will get there
1901 // via the fini block
1902 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1903 Term->eraseFromParent();
1904 }
1905
1906 // Register the outlined info.
1907 addOutlineInfo(std::move(OI));
1908
1909 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1910 UI->eraseFromParent();
1911
1912 return AfterIP;
1913}
1914
// Emits a call to the OpenMP runtime flush entry point. Builds the source
// location ident and passes it as the single runtime argument.
// NOTE(review): this is a doxygen source listing; the function signature line
// (original line 1915, presumably `OpenMPIRBuilder::emitFlush(...)`) and the
// call line (original 1921, the runtime-call creation for OMPRTL___kmpc_flush
// whose trailing `Args);` survives below) were dropped by the extraction —
// confirm against the upstream file before editing.
 1916 // Build call void __kmpc_flush(ident_t *loc)
 1917 uint32_t SrcLocStrSize;
 1918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// The ident (source-location descriptor) is the only argument to the flush RTL call.
 1919 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
 1920
 1922 Args);
 1923}
1924
// Public entry point for the OpenMP `flush` directive: moves the builder to
// the directive's location and delegates to emitFlush.
// NOTE(review): the signature line (original 1925) was dropped by the
// extraction — confirm against the upstream file.
 1926 if (!updateToLocation(Loc))
// Invalid/unset location: emit nothing.
 1927 return;
 1928 emitFlush(Loc);
 1929}
1930
// Emits the runtime call implementing the OpenMP `taskwait` directive:
// __kmpc_omp_taskwait(ident, global_tid).
// NOTE(review): the signature line (original 1931) and the call-creation line
// (original 1940, whose argument list survives below) were dropped by the
// extraction — confirm against the upstream file.
 1932 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
 1933 // global_tid);
 1934 uint32_t SrcLocStrSize;
 1935 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Arguments: source-location ident and the current thread's global tid.
 1937 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
 1938
 1939 // Ignore return result until untied tasks are supported.
 1941 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
 1942}
1943
1949
// Emits the runtime call implementing the OpenMP `taskyield` directive:
// __kmpc_omp_taskyield(ident, global_tid, 0). The trailing 0 is the
// `end_part` argument per the comment below.
// NOTE(review): the signature line (original 1950) and the call-creation line
// (original 1958) were dropped by the extraction — confirm upstream.
 1951 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
 1952 uint32_t SrcLocStrSize;
 1953 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1954 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Third runtime argument is a constant i32 0.
 1955 Constant *I32Null = ConstantInt::getNullValue(Int32);
 1956 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
 1957
 1959 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
 1960}
1961
1967
// Materializes an on-stack array of kmp_depend_info records for a task's
// `depend` clauses and returns the array (or nullptr when there are none).
// NOTE(review): the extraction dropped the function signature (original line
// 1973, a static helper taking `OpenMPIRBuilder &` plus a vector of
// DependData — parameter line 1975 also dropped) and the argument of the
// SetInsertPoint call at original line 2003 (presumably the entry block's
// first insertion point, per the "entry block" comment above) — confirm
// against the upstream file.
 1968// Processes the dependencies in Dependencies and does the following
 1969// - Allocates space on the stack of an array of DependInfo objects
 1970// - Populates each DependInfo object with relevant information of
 1971// the corresponding dependence.
 1972// - All code is inserted in the entry block of the current function.
 1974 OpenMPIRBuilder &OMPBuilder,
 1976 // Early return if we have no dependencies to process
 1977 if (Dependencies.empty())
 1978 return nullptr;
 1979
 1980 // Given a vector of DependData objects, in this function we create an
 1981 // array on the stack that holds kmp_dep_info objects corresponding
 1982 // to each dependency. This is then passed to the OpenMP runtime.
 1983 // For example, if there are 'n' dependencies then the following psedo
 1984 // code is generated. Assume the first dependence is on a variable 'a'
 1985 //
 1986 // \code{c}
 1987 // DepArray = alloc(n x sizeof(kmp_depend_info);
 1988 // idx = 0;
 1989 // DepArray[idx].base_addr = ptrtoint(&a);
 1990 // DepArray[idx].len = 8;
 1991 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
 1992 // ++idx;
 1993 // DepArray[idx].base_addr = ...;
 1994 // \endcode
 1995
 1996 IRBuilderBase &Builder = OMPBuilder.Builder;
 1997 Type *DependInfo = OMPBuilder.DependInfo;
 1998 Module &M = OMPBuilder.M;
 1999
 2000 Value *DepArray = nullptr;
// Save the current insertion point so the alloca can be placed elsewhere
// (the entry block) and codegen can resume where it left off.
 2001 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
 2002 Builder.SetInsertPoint(
 2004
 2005 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
 2006 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
 2007
 2008 Builder.restoreIP(OldIP);
 2009
// Fill in one kmp_depend_info record (base_addr, len, flags) per dependency.
 2010 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
 2011 Value *Base =
 2012 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
 2013 // Store the pointer to the variable
 2014 Value *Addr = Builder.CreateStructGEP(
 2015 DependInfo, Base,
 2016 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
// The runtime expects base_addr as an integer, hence the ptrtoint.
 2017 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
 2018 Builder.CreateStore(DepValPtr, Addr);
 2019 // Store the size of the variable
 2020 Value *Size = Builder.CreateStructGEP(
 2021 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
 2022 Builder.CreateStore(
 2023 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
 2024 Size);
 2025 // Store the dependency kind
 2026 Value *Flags = Builder.CreateStructGEP(
 2027 DependInfo, Base,
 2028 static_cast<unsigned int>(RTLDependInfoFields::Flags));
 2029 Builder.CreateStore(
 2030 ConstantInt::get(Builder.getInt8Ty(),
 2031 static_cast<unsigned int>(Dep.DepKind)),
 2032 Flags);
 2033 }
 2034 return DepArray;
 2035}
2036
// Builds the `p_task_dup_t` callback that __kmpc_taskloop uses to copy
// private state from the pattern task into each generated task. The emitted
// function is `void omp_taskloop_dup(kmp_task_t *dest, kmp_task_t *src,
// kmp_int32 lastprivate_flag)` with internal linkage; its body is produced by
// invoking the frontend-supplied DupCB on pointers to the context struct
// inside each task's privates area (located via PrivatesIndex).
// NOTE(review): the extraction dropped original line 2042 — the early-return
// expression for a null DupCB (the surviving `PointerType::get(...)` argument
// suggests it returns a null pointer constant) — confirm upstream.
 2037/// Create the task duplication function passed to kmpc_taskloop.
 2038Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
 2039 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
 2040 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
// No duplication callback: no dup function is needed.
 2041 if (!DupCB)
 2043 PointerType::get(Builder.getContext(), ProgramAddressSpace));
 2044
 2045 // From OpenMP Runtime p_task_dup_t:
 2046 // Routine optionally generated by the compiler for setting the lastprivate
 2047 // flag and calling needed constructors for private/firstprivate objects (used
 2048 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
 2049 // lastprivate flag.
 2050 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
 2051
 2052 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
 2053
 2054 FunctionType *DupFuncTy = FunctionType::get(
 2055 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
 2056 /*isVarArg=*/false);
 2057
 2058 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
 2059 "omp_taskloop_dup", M);
 2060 Value *DestTaskArg = DupFunction->getArg(0);
 2061 Value *SrcTaskArg = DupFunction->getArg(1);
 2062 Value *LastprivateFlagArg = DupFunction->getArg(2);
 2063 DestTaskArg->setName("dest_task");
 2064 SrcTaskArg->setName("src_task");
 2065 LastprivateFlagArg->setName("lastprivate_flag");
 2066
// Guard restores the caller's insertion point when this function returns.
 2067 IRBuilderBase::InsertPointGuard Guard(Builder);
 2068 Builder.SetInsertPoint(
 2069 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
 2070
// Computes &((kmp_task_t_with_privates *)Arg)->privates[PrivatesIndex]:
// first GEP selects the privates struct appended after the kmp_task_t,
// second GEP selects the context-pointer field inside it.
 2071 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
 2072 Type *TaskWithPrivatesTy =
 2073 StructType::get(Builder.getContext(), {Task, PrivatesTy});
 2074 Value *TaskPrivates = Builder.CreateGEP(
 2075 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
 2076 Value *ContextPtr = Builder.CreateGEP(
 2077 PrivatesTy, TaskPrivates,
 2078 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
 2079 return ContextPtr;
 2080 };
 2081
 2082 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
 2083 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
 2084
 2085 DestTaskContextPtr->setName("destPtr");
 2086 SrcTaskContextPtr->setName("srcPtr");
 2087
// Let the frontend callback generate the actual copy/construction code.
 2088 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
 2089 DupFunction->getEntryBlock().begin());
 2090 InsertPointTy CodeGenIP = Builder.saveIP();
 2091 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
 2092 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
 2093 if (!AfterIPOrError)
 2094 return AfterIPOrError.takeError();
 2095 Builder.restoreIP(*AfterIPOrError);
 2096
 2097 Builder.CreateRetVoid();
 2098
 2099 return DupFunction;
 2100}
2101
// Generates code for the OpenMP `taskloop` construct.
//
// High-level flow:
//  1. Split the current block into taskloop.alloca / taskloop.body /
//     taskloop.exit and let BodyGenCB emit the loop body region.
//  2. Set up OutlineInfo so the region is extracted into a task entry
//     function; "fake" lb/ub/step values are forced to be the first inputs of
//     the shareds aggregate so their offsets are known to the runtime glue.
//  3. Pre-build the task duplication function (DupCB cannot run inside the
//     post-outline callback because the final shareds type is unknown there).
//  4. In PostOutlineCB (runs after CodeExtractor): replace the stale call
//     with __kmpc_omp_task_alloc + __kmpc_taskloop (wrapped in
//     __kmpc_taskgroup/__kmpc_end_taskgroup unless NoGroup), and rewrite the
//     outlined body to derive its trip count and lower bound from the
//     per-task lb/ub/step stored in the task's shareds.
// NOTE(review): this is a doxygen listing with dropped lines; in particular
// original line 2241 (the initializer of ArgStructAlloca, presumably a
// dyn_cast<AllocaInst> of the stale call's shareds operand, mirroring the
// identical pattern in createTask below) is missing — confirm upstream.
 2102OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
 2103 const LocationDescription &Loc, InsertPointTy AllocaIP,
 2104 BodyGenCallbackTy BodyGenCB,
 2105 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
 2106 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
 2107 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
 2108 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
 2109 Value *TaskContextStructPtrVal) {
 2110
 2111 if (!updateToLocation(Loc))
 2112 return InsertPointTy();
 2113
 2114 uint32_t SrcLocStrSize;
 2115 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 2116 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 2117
// Carve out the to-be-outlined region: alloca block -> body -> exit.
 2118 BasicBlock *TaskloopExitBB =
 2119 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
 2120 BasicBlock *TaskloopBodyBB =
 2121 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
 2122 BasicBlock *TaskloopAllocaBB =
 2123 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
 2124
 2125 InsertPointTy TaskloopAllocaIP =
 2126 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
 2127 InsertPointTy TaskloopBodyIP =
 2128 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
 2129
 2130 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
 2131 return Err;
 2132
// The canonical loop is produced by the caller-provided callback so it can
// be created after the body region exists.
 2133 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
 2134 if (!result) {
 2135 return result.takeError();
 2136 }
 2137
 2138 llvm::CanonicalLoopInfo *CLI = result.get();
 2139 OutlineInfo OI;
 2140 OI.EntryBB = TaskloopAllocaBB;
 2141 OI.OuterAllocaBB = AllocaIP.getBlock();
 2142 OI.ExitBB = TaskloopExitBB;
 2143
 2144 // Add the thread ID argument.
 2145 SmallVector<Instruction *> ToBeDeleted;
 2146 // dummy instruction to be used as a fake argument
 2147 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
 2148 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
// Placeholder lb/ub/step values; replaced with the real (casted) bounds in
// the post-outline callback below.
 2149 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2150 TaskloopAllocaIP, "lb", false, true);
 2151 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2152 TaskloopAllocaIP, "ub", false, true);
 2153 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2154 TaskloopAllocaIP, "step", false, true);
 2155 // For Taskloop, we want to force the bounds being the first 3 inputs in the
 2156 // aggregate struct
 2157 OI.Inputs.insert(FakeLB);
 2158 OI.Inputs.insert(FakeUB);
 2159 OI.Inputs.insert(FakeStep);
 2160 if (TaskContextStructPtrVal)
 2161 OI.Inputs.insert(TaskContextStructPtrVal);
 2162 assert(((TaskContextStructPtrVal && DupCB) ||
 2163 (!TaskContextStructPtrVal && !DupCB)) &&
 2164 "Task context struct ptr and duplication callback must be both set "
 2165 "or both null");
 2166
 2167 // It isn't safe to run the duplication bodygen callback inside the post
 2168 // outlining callback so this has to be run now before we know the real task
 2169 // shareds structure type.
 2170 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
 2171 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
// Stand-in shareds layout: { lb, ub, step, context-ptr } — mirrors the
// forced input order established above.
 2172 Type *FakeSharedsTy = StructType::get(
 2173 Builder.getContext(),
 2174 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
 2175 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
 2176 FakeSharedsTy,
 2177 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
 2178 if (!TaskDupFnOrErr) {
 2179 return TaskDupFnOrErr.takeError();
 2180 }
 2181 Value *TaskDupFn = *TaskDupFnOrErr;
 2182
// Everything below runs after CodeExtractor has produced OutlinedFn and
// replaced the region with a single (stale) call to it.
 2183 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
 2184 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
 2185 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
 2186 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
 2187 NumOfCollapseLoops](Function &OutlinedFn) mutable {
 2188 // Replace the Stale CI by appropriate RTL function call.
 2189 assert(OutlinedFn.hasOneUse() &&
 2190 "there must be a single user for the outlined function");
 2191 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
 2192
 2193 /* Create the casting for the Bounds Values that can be used when outlining
 2194 * to replace the uses of the fakes with real values */
 2195 BasicBlock *CodeReplBB = StaleCI->getParent();
 2196 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
 2197 Value *CastedLBVal =
 2198 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
 2199 Value *CastedUBVal =
 2200 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
 2201 Value *CastedStepVal =
 2202 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
 2203
 2204 Builder.SetInsertPoint(StaleCI);
 2205
 2206 // Gather the arguments for emitting the runtime call for
 2207 // @__kmpc_omp_task_alloc
 2208 Function *TaskAllocFn =
 2209 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
 2210
 2211 Value *ThreadID = getOrCreateThreadID(Ident);
 2212
 2213 if (!NoGroup) {
 2214 // Emit runtime call for @__kmpc_taskgroup
 2215 Function *TaskgroupFn =
 2216 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
 2217 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
 2218 }
 2219
 2220 // `flags` Argument Configuration
 2221 // Task is tied if (Flags & 1) == 1.
 2222 // Task is untied if (Flags & 1) == 0.
 2223 // Task is final if (Flags & 2) == 2.
 2224 // Task is not final if (Flags & 2) == 0.
 2225 // Task is mergeable if (Flags & 4) == 4.
 2226 // Task is not mergeable if (Flags & 4) == 0.
 2227 // Task is priority if (Flags & 32) == 32.
 2228 // Task is not priority if (Flags & 32) == 0.
 2229 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
 2230 if (Final)
 2231 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
 2232 if (Mergeable)
 2233 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
 2234 if (Priority)
 2235 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
 2236
// sizeof(kmp_task_t) in bytes, rounded up from the bit size.
 2237 Value *TaskSize = Builder.getInt64(
 2238 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
 2239
// NOTE(review): the initializer of ArgStructAlloca (original line 2241) was
// dropped by the extraction; the asserts below imply it locates the alloca
// of the aggregated arguments produced by the extractor.
 2240 AllocaInst *ArgStructAlloca =
 2242 assert(ArgStructAlloca &&
 2243 "Unable to find the alloca instruction corresponding to arguments "
 2244 "for extracted function");
 2245 std::optional<TypeSize> ArgAllocSize =
 2246 ArgStructAlloca->getAllocationSize(M.getDataLayout());
 2247 assert(ArgAllocSize &&
 2248 "Unable to determine size of arguments for extracted function");
 2249 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
 2250
 2251 // Emit the @__kmpc_omp_task_alloc runtime call
 2252 // The runtime call returns a pointer to an area where the task captured
 2253 // variables must be copied before the task is run (TaskData)
 2254 CallInst *TaskData = Builder.CreateCall(
 2255 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
 2256 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
 2257 /*task_func=*/&OutlinedFn});
 2258
// Copy the captured arguments into the task's shareds area.
 2259 Value *Shareds = StaleCI->getArgOperand(1);
 2260 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
 2261 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
 2262 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
 2263 SharedsSize);
 2264 // Get the pointer to loop lb, ub, step from task ptr
 2265 // and set up the lowerbound,upperbound and step values
 2266 llvm::Value *Lb = Builder.CreateGEP(
 2267 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
 2268
 2269 llvm::Value *Ub = Builder.CreateGEP(
 2270 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
 2271
 2272 llvm::Value *Step = Builder.CreateGEP(
 2273 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
 2274 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
 2275
 2276 // set up the arguments for emitting kmpc_taskloop runtime call
 2277 // setting values for ifval, nogroup, sched, grainsize, task_dup
 2278 Value *IfCondVal =
 2279 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
 2280 : Builder.getInt32(1);
 2281 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
 2282 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
 2283 Value *NoGroupVal = Builder.getInt32(1);
 2284 Value *SchedVal = Builder.getInt32(Sched);
 2285 Value *GrainSizeVal =
 2286 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
 2287 : Builder.getInt64(0);
 2288 Value *TaskDup = TaskDupFn;
 2289
 2290 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
 2291 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
 2292
 2293 // taskloop runtime call
 2294 Function *TaskloopFn =
 2295 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
 2296 Builder.CreateCall(TaskloopFn, Args);
 2297
 2298 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
 2299 // nogroup is not defined
 2300 if (!NoGroup) {
 2301 Function *EndTaskgroupFn =
 2302 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
 2303 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
 2304 }
 2305
 2306 StaleCI->eraseFromParent();
 2307
// From here on we patch the *outlined* function so each task reads its
// per-chunk bounds from the shareds struct the runtime hands it.
 2308 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
 2309
 2310 LoadInst *SharedsOutlined =
 2311 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
 2312 OutlinedFn.getArg(1)->replaceUsesWithIf(
 2313 SharedsOutlined,
 2314 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
 2315
 2316 Value *IV = CLI->getIndVar();
 2317 Type *IVTy = IV->getType();
 2318 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
 2319
 2320 // When outlining, CodeExtractor will create GEP's to the LowerBound and
 2321 // UpperBound. These GEP's can be reused for loading the tasks respective
 2322 // bounds.
 2323 Value *TaskLB = nullptr;
 2324 Value *TaskUB = nullptr;
 2325 Value *TaskStep = nullptr;
 2326 Value *LoadTaskLB = nullptr;
 2327 Value *LoadTaskUB = nullptr;
 2328 Value *LoadTaskStep = nullptr;
// Recognize the extractor-generated GEPs by their constant field index
// (0=lb, 1=ub, 2=step — the forced input order), then their loads.
 2329 for (Instruction &I : *TaskloopAllocaBB) {
 2330 if (I.getOpcode() == Instruction::GetElementPtr) {
 2331 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
 2332 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
 2333 switch (CI->getZExtValue()) {
 2334 case 0:
 2335 TaskLB = &I;
 2336 break;
 2337 case 1:
 2338 TaskUB = &I;
 2339 break;
 2340 case 2:
 2341 TaskStep = &I;
 2342 break;
 2343 }
 2344 }
 2345 } else if (I.getOpcode() == Instruction::Load) {
 2346 LoadInst &Load = cast<LoadInst>(I);
 2347 if (Load.getPointerOperand() == TaskLB) {
 2348 assert(TaskLB != nullptr && "Expected value for TaskLB");
 2349 LoadTaskLB = &I;
 2350 } else if (Load.getPointerOperand() == TaskUB) {
 2351 assert(TaskUB != nullptr && "Expected value for TaskUB");
 2352 LoadTaskUB = &I;
 2353 } else if (Load.getPointerOperand() == TaskStep) {
 2354 assert(TaskStep != nullptr && "Expected value for TaskStep");
 2355 LoadTaskStep = &I;
 2356 }
 2357 }
 2358 }
 2359
 2360 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
 2361
 2362 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
 2363 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
 2364 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
// Per-task trip count: (ub - lb) / step + 1, computed in the preheader.
 2365 Value *TripCountMinusOne = Builder.CreateSDiv(
 2366 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
 2367 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
 2368 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
 2369 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
 2370 // set the trip count in the CLI
 2371 CLI->setTripCount(CastedTripCount);
 2372
 2373 Builder.SetInsertPoint(CLI->getBody(),
 2374 CLI->getBody()->getFirstInsertionPt());
 2375
 2376 if (NumOfCollapseLoops > 1) {
 2377 llvm::SmallVector<User *> UsersToReplace;
 2378 // When using the collapse clause, the bounds of the loop have to be
 2379 // adjusted to properly represent the iterator of the outer loop.
 2380 Value *IVPlusTaskLB = Builder.CreateAdd(
 2381 CLI->getIndVar(),
 2382 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
 2383 // To ensure every Use is correctly captured, we first want to record
 2384 // which users to replace the value in, and then replace the value.
 2385 for (auto IVUse = CLI->getIndVar()->uses().begin();
 2386 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
 2387 User *IVUser = IVUse->getUser();
 2388 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
 2389 if (Op->getOpcode() == Instruction::URem ||
 2390 Op->getOpcode() == Instruction::UDiv) {
 2391 UsersToReplace.push_back(IVUser);
 2392 }
 2393 }
 2394 }
 2395 for (User *User : UsersToReplace) {
 2396 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
 2397 }
 2398 } else {
 2399 // The canonical loop is generated with a fixed lower bound. We need to
 2400 // update the index calculation code to use the task's lower bound. The
 2401 // generated code looks like this:
 2402 // %omp_loop.iv = phi ...
 2403 // ...
 2404 // %tmp = mul [type] %omp_loop.iv, step
 2405 // %user_index = add [type] tmp, lb
 2406 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
 2407 // of the normalised induction variable:
 2408 // 1. This one: converting the normalised IV to the user IV
 2409 // 2. The increment (add)
 2410 // 3. The comparison against the trip count (icmp)
 2411 // (1) is the only use that is a mul followed by an add so this cannot
 2412 // match other IR.
 2413 assert(CLI->getIndVar()->getNumUses() == 3 &&
 2414 "Canonical loop should have exactly three uses of the ind var");
 2415 for (User *IVUser : CLI->getIndVar()->users()) {
 2416 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
 2417 if (Mul->getOpcode() == Instruction::Mul) {
 2418 for (User *MulUser : Mul->users()) {
 2419 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
 2420 if (Add->getOpcode() == Instruction::Add) {
 2421 Add->setOperand(1, CastedTaskLB);
 2422 }
 2423 }
 2424 }
 2425 }
 2426 }
 2427 }
 2428 }
 2429
// Swap the placeholder bounds for the real casted values and drop the fake
// instructions (in reverse creation order so uses die before defs).
 2430 FakeLB->replaceAllUsesWith(CastedLBVal);
 2431 FakeUB->replaceAllUsesWith(CastedUBVal);
 2432 FakeStep->replaceAllUsesWith(CastedStepVal);
 2433 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
 2434 I->eraseFromParent();
 2435 }
 2436 };
 2437
 2438 addOutlineInfo(std::move(OI));
 2439 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
 2440 return Builder.saveIP();
 2441}
2442
// Builds a runtime record type laid out as { intptr, intptr, i32 } using the
// module's pointer width for the two leading fields.
// NOTE(review): the function signature and the declaration of IntPtrTy
// (original lines 2443-2444) were dropped by the extraction; the shape
// matches a base_addr/len/flags record (e.g. task affinity info) — confirm
// against the upstream file before relying on this.
 2445 M.getContext(), M.getDataLayout().getPointerSizeInBits());
 2446 return llvm::StructType::get(IntPtrTy, IntPtrTy,
 2447 llvm::Type::getInt32Ty(M.getContext()));
 2448}
2449
2451 const LocationDescription &Loc, InsertPointTy AllocaIP,
2452 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2453 SmallVector<DependData> Dependencies, AffinityData Affinities,
2454 bool Mergeable, Value *EventHandle, Value *Priority) {
2455
2456 if (!updateToLocation(Loc))
2457 return InsertPointTy();
2458
2459 uint32_t SrcLocStrSize;
2460 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2461 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2462 // The current basic block is split into four basic blocks. After outlining,
2463 // they will be mapped as follows:
2464 // ```
2465 // def current_fn() {
2466 // current_basic_block:
2467 // br label %task.exit
2468 // task.exit:
2469 // ; instructions after task
2470 // }
2471 // def outlined_fn() {
2472 // task.alloca:
2473 // br label %task.body
2474 // task.body:
2475 // ret void
2476 // }
2477 // ```
2478 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2479 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2480 BasicBlock *TaskAllocaBB =
2481 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2482
2483 InsertPointTy TaskAllocaIP =
2484 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2485 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2486 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2487 return Err;
2488
2489 OutlineInfo OI;
2490 OI.EntryBB = TaskAllocaBB;
2491 OI.OuterAllocaBB = AllocaIP.getBlock();
2492 OI.ExitBB = TaskExitBB;
2493
2494 // Add the thread ID argument.
2497 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2498
2499 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2500 Affinities, Mergeable, Priority, EventHandle,
2501 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
2502 // Replace the Stale CI by appropriate RTL function call.
2503 assert(OutlinedFn.hasOneUse() &&
2504 "there must be a single user for the outlined function");
2505 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2506
2507 // HasShareds is true if any variables are captured in the outlined region,
2508 // false otherwise.
2509 bool HasShareds = StaleCI->arg_size() > 1;
2510 Builder.SetInsertPoint(StaleCI);
2511
2512 // Gather the arguments for emitting the runtime call for
2513 // @__kmpc_omp_task_alloc
2514 Function *TaskAllocFn =
2515 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2516
2517 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2518 // call.
2519 Value *ThreadID = getOrCreateThreadID(Ident);
2520
2521 // Argument - `flags`
2522 // Task is tied iff (Flags & 1) == 1.
2523 // Task is untied iff (Flags & 1) == 0.
2524 // Task is final iff (Flags & 2) == 2.
2525 // Task is not final iff (Flags & 2) == 0.
2526 // Task is mergeable iff (Flags & 4) == 4.
2527 // Task is not mergeable iff (Flags & 4) == 0.
2528 // Task is priority iff (Flags & 32) == 32.
2529 // Task is not priority iff (Flags & 32) == 0.
2530 // TODO: Handle the other flags.
2531 Value *Flags = Builder.getInt32(Tied);
2532 if (Final) {
2533 Value *FinalFlag =
2534 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2535 Flags = Builder.CreateOr(FinalFlag, Flags);
2536 }
2537
2538 if (Mergeable)
2539 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2540 if (Priority)
2541 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2542
2543 // Argument - `sizeof_kmp_task_t` (TaskSize)
2544 // Tasksize refers to the size in bytes of kmp_task_t data structure
2545 // including private vars accessed in task.
2546 // TODO: add kmp_task_t_with_privates (privates)
2547 Value *TaskSize = Builder.getInt64(
2548 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2549
2550 // Argument - `sizeof_shareds` (SharedsSize)
2551 // SharedsSize refers to the shareds array size in the kmp_task_t data
2552 // structure.
2553 Value *SharedsSize = Builder.getInt64(0);
2554 if (HasShareds) {
2555 AllocaInst *ArgStructAlloca =
2557 assert(ArgStructAlloca &&
2558 "Unable to find the alloca instruction corresponding to arguments "
2559 "for extracted function");
2560 std::optional<TypeSize> ArgAllocSize =
2561 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2562 assert(ArgAllocSize &&
2563 "Unable to determine size of arguments for extracted function");
2564 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2565 }
2566 // Emit the @__kmpc_omp_task_alloc runtime call
2567 // The runtime call returns a pointer to an area where the task captured
2568 // variables must be copied before the task is run (TaskData)
2570 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2571 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2572 /*task_func=*/&OutlinedFn});
2573
2574 if (Affinities.Count && Affinities.Info) {
2576 OMPRTL___kmpc_omp_reg_task_with_affinity);
2577
2578 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2579 Affinities.Count, Affinities.Info});
2580 }
2581
2582 // Emit detach clause initialization.
2583 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2584 // task_descriptor);
2585 if (EventHandle) {
2587 OMPRTL___kmpc_task_allow_completion_event);
2588 llvm::Value *EventVal =
2589 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2590 llvm::Value *EventHandleAddr =
2591 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2592 Builder.getPtrTy(0));
2593 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2594 Builder.CreateStore(EventVal, EventHandleAddr);
2595 }
2596 // Copy the arguments for outlined function
2597 if (HasShareds) {
2598 Value *Shareds = StaleCI->getArgOperand(1);
2599 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2600 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2601 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2602 SharedsSize);
2603 }
2604
2605 if (Priority) {
2606 //
2607 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2608 // we populate the priority information into the "kmp_task_t" here
2609 //
2610 // The struct "kmp_task_t" definition is available in kmp.h
2611 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2612 // data2 is used for priority
2613 //
2614 Type *Int32Ty = Builder.getInt32Ty();
2615 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2616 // kmp_task_t* => { ptr }
2617 Type *TaskPtr = StructType::get(VoidPtr);
2618 Value *TaskGEP =
2619 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2620 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2621 Type *TaskStructType = StructType::get(
2622 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2623 Value *PriorityData = Builder.CreateInBoundsGEP(
2624 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2625 // kmp_cmplrdata_t => { ptr, ptr }
2626 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2627 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2628 PriorityData, {Zero, Zero});
2629 Builder.CreateStore(Priority, CmplrData);
2630 }
2631
2632 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2633
2634 // In the presence of the `if` clause, the following IR is generated:
2635 // ...
2636 // %data = call @__kmpc_omp_task_alloc(...)
2637 // br i1 %if_condition, label %then, label %else
2638 // then:
2639 // call @__kmpc_omp_task(...)
2640 // br label %exit
2641 // else:
2642 // ;; Wait for resolution of dependencies, if any, before
2643 // ;; beginning the task
2644 // call @__kmpc_omp_wait_deps(...)
2645 // call @__kmpc_omp_task_begin_if0(...)
2646 // call @outlined_fn(...)
2647 // call @__kmpc_omp_task_complete_if0(...)
2648 // br label %exit
2649 // exit:
2650 // ...
2651 if (IfCondition) {
2652 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2653 // terminator.
2654 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2655 Instruction *IfTerminator =
2656 Builder.GetInsertPoint()->getParent()->getTerminator();
2657 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2658 Builder.SetInsertPoint(IfTerminator);
2659 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2660 &ElseTI);
2661 Builder.SetInsertPoint(ElseTI);
2662
2663 if (Dependencies.size()) {
2664 Function *TaskWaitFn =
2665 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2667 TaskWaitFn,
2668 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2669 ConstantInt::get(Builder.getInt32Ty(), 0),
2671 }
2672 Function *TaskBeginFn =
2673 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2674 Function *TaskCompleteFn =
2675 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2676 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2677 CallInst *CI = nullptr;
2678 if (HasShareds)
2679 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2680 else
2681 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2682 CI->setDebugLoc(StaleCI->getDebugLoc());
2683 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2684 Builder.SetInsertPoint(ThenTI);
2685 }
2686
2687 if (Dependencies.size()) {
2688 Function *TaskFn =
2689 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2691 TaskFn,
2692 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2693 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2695
2696 } else {
2697 // Emit the @__kmpc_omp_task runtime call to spawn the task
2698 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2699 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2700 }
2701
2702 StaleCI->eraseFromParent();
2703
2704 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2705 if (HasShareds) {
2706 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2707 OutlinedFn.getArg(1)->replaceUsesWithIf(
2708 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2709 }
2710
2711 for (Instruction *I : llvm::reverse(ToBeDeleted))
2712 I->eraseFromParent();
2713 };
2714
2715 addOutlineInfo(std::move(OI));
2716 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2717
2718 return Builder.saveIP();
2719}
2720
/// Build an OpenMP `taskgroup` region: emits __kmpc_taskgroup(ident, tid)
/// before the caller-generated body and __kmpc_end_taskgroup(ident, tid) in a
/// split-off "taskgroup.exit" block after it. Returns the insert point after
/// the region, or the error propagated from \p BodyGenCB.
/// (NOTE(review): the signature line naming this function is missing from
/// this dump; the runtime calls below identify it as the taskgroup builder.)
2723 InsertPointTy AllocaIP,
2724 BodyGenCallbackTy BodyGenCB) {
2725 if (!updateToLocation(Loc))
2726 return InsertPointTy();
2727
// Build the source-location ident and thread-id values required by every
// libomp entry point.
2728 uint32_t SrcLocStrSize;
2729 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2730 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2731 Value *ThreadID = getOrCreateThreadID(Ident);
2732
2733 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2734 Function *TaskgroupFn =
2735 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2736 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2737
// Split off the exit block first so the body is generated between the
// start call and the end call; an error from the callback aborts codegen.
2738 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2739 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2740 return Err;
2741
2742 Builder.SetInsertPoint(TaskgroupExitBB);
2743 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2744 Function *EndTaskgroupFn =
2745 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2746 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2747
2748 return Builder.saveIP();
2749}
2750
/// Build an OpenMP `sections` construct. The sections are lowered as a
/// canonical loop whose body is a switch on the induction variable, one case
/// per section callback; the loop is then distributed among threads with
/// applyStaticWorkshareLoop, and the finalization callback is merged into the
/// loop-finalization block. Returns the insert point after the construct, or
/// any error propagated from the section callbacks / loop builders.
/// (NOTE(review): the signature line naming this function and a few interior
/// lines are missing from this dump.)
2752 const LocationDescription &Loc, InsertPointTy AllocaIP,
2754 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2755 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2756
2757 if (!updateToLocation(Loc))
2758 return Loc.IP;
2759
// Register the finalization so nested cancellation constructs can find it;
// popped (and checked) again below before merging into the fini block.
2760 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2761
2762 // Each section is emitted as a switch case
2763 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2764 // -> OMP.createSection() which generates the IR for each section
2765 // Iterate through all sections and emit a switch construct:
2766 // switch (IV) {
2767 // case 0:
2768 // <SectionStmt[0]>;
2769 // break;
2770 // ...
2771 // case <NumSection> - 1:
2772 // <SectionStmt[<NumSection> - 1]>;
2773 // break;
2774 // }
2775 // ...
2776 // section_loop.after:
2777 // <FiniCB>;
2778 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2779 Builder.restoreIP(CodeGenIP);
2781 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2782 Function *CurFn = Continue->getParent();
2783 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2784
// One switch case per section; each case body branches back to Continue,
// and the section callback generates its code before that branch.
2785 unsigned CaseNumber = 0;
2786 for (auto SectionCB : SectionCBs) {
2788 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2789 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2790 Builder.SetInsertPoint(CaseBB);
2791 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
2792 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2793 CaseEndBr->getIterator()}))
2794 return Err;
2795 CaseNumber++;
2796 }
2797 // remove the existing terminator from body BB since there can be no
2798 // terminators after switch/case
2799 return Error::success();
2800 };
2801 // Loop body ends here
2802 // LowerBound, UpperBound, and STride for createCanonicalLoop
2803 Type *I32Ty = Type::getInt32Ty(M.getContext());
2804 Value *LB = ConstantInt::get(I32Ty, 0);
2805 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2806 Value *ST = ConstantInt::get(I32Ty, 1);
2808 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2809 if (!LoopInfo)
2810 return LoopInfo.takeError();
2811
// Workshare the section loop statically; a barrier is implied unless the
// user requested nowait.
2812 InsertPointOrErrorTy WsloopIP =
2813 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2814 WorksharingLoopType::ForStaticLoop, !IsNowait);
2815 if (!WsloopIP)
2816 return WsloopIP.takeError();
2817 InsertPointTy AfterIP = *WsloopIP;
2818
2819 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2820 assert(LoopFini && "Bad structure of static workshare loop finalization");
2821
2822 // Apply the finalization callback in LoopAfterBB
2823 auto FiniInfo = FinalizationStack.pop_back_val();
2824 assert(FiniInfo.DK == OMPD_sections &&
2825 "Unexpected finalization stack state!");
2826 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2827 return Err;
2828
2829 return AfterIP;
2830}
2831
/// Build a single `section` inside a `sections` construct as an inlined
/// region. The finalization callback is wrapped so that, when the insert
/// point sits at the end of a (terminator-less) cancellation block, a branch
/// to the construct's exit block is created first before invoking the real
/// finalizer. (NOTE(review): the signature line naming this function is
/// missing from this dump.)
2834 BodyGenCallbackTy BodyGenCB,
2835 FinalizeCallbackTy FiniCB) {
2836 if (!updateToLocation(Loc))
2837 return Loc.IP;
2838
2839 auto FiniCBWrapper = [&](InsertPointTy IP) {
// Fast path: the block still has instructions after IP, so it has a
// terminator and the user's finalizer can run as-is.
2840 if (IP.getBlock()->end() != IP.getPoint())
2841 return FiniCB(IP);
2842 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2843 // will fail because that function requires the Finalization Basic Block to
2844 // have a terminator, which is already removed by EmitOMPRegionBody.
2845 // IP is currently at cancelation block.
2846 // We need to backtrack to the condition block to fetch
2847 // the exit block and create a branch from cancelation
2848 // to exit block.
2850 Builder.restoreIP(IP);
// Walk two predecessors up from the section's case block to reach the
// condition block; its second successor is the construct's exit block.
2851 auto *CaseBB = Loc.IP.getBlock();
2852 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2853 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2854 Instruction *I = Builder.CreateBr(ExitBB);
2855 IP = InsertPointTy(I->getParent(), I->getIterator());
2856 return FiniCB(IP);
2857 };
2858
2859 Directive OMPD = Directive::OMPD_sections;
2860 // Since we are using Finalization Callback here, HasFinalize
2861 // and IsCancellable have to be true
2862 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2863 /*Conditional*/ false, /*hasFinalize*/ true,
2864 /*IsCancellable*/ true);
2865}
2866
2872
/// Return the hardware thread id of the current thread within its block, via
/// a call to the __kmpc_get_hardware_thread_id_in_block device runtime entry.
/// (NOTE(review): the line constructing the call is elided in this dump.)
2873Value *OpenMPIRBuilder::getGPUThreadID() {
2876 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2877 {});
2878}
2879
/// Return the warp size of the target device, via a call to the
/// __kmpc_get_warp_size device runtime entry.
2880Value *OpenMPIRBuilder::getGPUWarpSize() {
2882 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2883}
2884
2885Value *OpenMPIRBuilder::getNVPTXWarpID() {
2886 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2887 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2888}
2889
2890Value *OpenMPIRBuilder::getNVPTXLaneID() {
2891 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2892 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2893 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2894 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2895 "nvptx_lane_id");
2896}
2897
2898Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2899 Type *ToType) {
2900 Type *FromType = From->getType();
2901 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2902 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2903 assert(FromSize > 0 && "From size must be greater than zero");
2904 assert(ToSize > 0 && "To size must be greater than zero");
2905 if (FromType == ToType)
2906 return From;
2907 if (FromSize == ToSize)
2908 return Builder.CreateBitCast(From, ToType);
2909 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2910 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2911 InsertPointTy SaveIP = Builder.saveIP();
2912 Builder.restoreIP(AllocaIP);
2913 Value *CastItem = Builder.CreateAlloca(ToType);
2914 Builder.restoreIP(SaveIP);
2915
2916 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 CastItem, Builder.getPtrTy(0));
2918 Builder.CreateStore(From, ValCastItem);
2919 return Builder.CreateLoad(ToType, CastItem);
2920}
2921
/// Shuffle \p Element (at most 8 bytes) across the warp by \p Offset lanes
/// using the device runtime: the value is first widened to i32 or i64, passed
/// to __kmpc_shuffle_int32 / __kmpc_shuffle_int64 together with the warp
/// size, and the result is cast back via castValueToType.
/// (NOTE(review): the line binding the selected runtime function to
/// ShuffleFunc is elided in this dump.)
2922Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2923 Value *Element,
2924 Type *ElementType,
2925 Value *Offset) {
2926 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2927 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2928
2929 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2930 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2931 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2932 Value *WarpSize =
2933 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2935 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2936 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
// NOTE(review): WarpSize was already cast to i16 two lines above, so this
// second IntCast to i16 looks like a no-op — confirm intent.
2937 Value *WarpSizeCast =
2938 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2939 Value *ShuffleCall =
2940 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2941 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2942}
2943
/// Shuffle an element of \p ElemType from \p SrcAddr across the warp by
/// \p Offset lanes and store the result to \p DstAddr, decomposing the
/// element into 8/4/2/1-byte integer chunks, each moved with
/// createRuntimeShuffleFunction. Chunks that repeat ((Size / IntSize) > 1)
/// are handled with an emitted pre-cond/then/exit loop over PHI'd source and
/// destination pointers.
/// NOTE(review): the ReductionArrayTy and IsByRefElem parameters are not
/// referenced anywhere in this definition.
2944void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2945 Value *DstAddr, Type *ElemType,
2946 Value *Offset, Type *ReductionArrayTy,
2947 bool IsByRefElem) {
2948 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2949 // Create the loop over the big sized data.
2950 // ptr = (void*)Elem;
2951 // ptrEnd = (void*) Elem + 1;
2952 // Step = 8;
2953 // while (ptr + Step < ptrEnd)
2954 // shuffle((int64_t)*ptr);
2955 // Step = 4;
2956 // while (ptr + Step < ptrEnd)
2957 // shuffle((int32_t)*ptr);
2958 // ...
2959 Type *IndexTy = Builder.getIndexTy(
2960 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2961 Value *ElemPtr = DstAddr;
2962 Value *Ptr = SrcAddr;
// Walk chunk widths from 8 bytes down to 1; a width is skipped while the
// remaining Size is smaller than it.
2963 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2964 if (Size < IntSize)
2965 continue;
2966 Type *IntType = Builder.getIntNTy(IntSize * 8);
2967 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2968 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2969 Value *SrcAddrGEP =
2970 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2971 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2972 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2973
2974 Function *CurFunc = Builder.GetInsertBlock()->getParent();
// More than one chunk of this width: emit a loop that advances PHI'd
// src/dst pointers until fewer than IntSize bytes remain before the end
// of the element.
2975 if ((Size / IntSize) > 1) {
2976 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2977 SrcAddrGEP, Builder.getPtrTy());
2978 BasicBlock *PreCondBB =
2979 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2980 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2981 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2982 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2983 emitBlock(PreCondBB, CurFunc);
2984 PHINode *PhiSrc =
2985 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2986 PhiSrc->addIncoming(Ptr, CurrentBB);
2987 PHINode *PhiDest =
2988 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2989 PhiDest->addIncoming(ElemPtr, CurrentBB);
2990 Ptr = PhiSrc;
2991 ElemPtr = PhiDest;
2992 Value *PtrDiff = Builder.CreatePtrDiff(
2993 Builder.getInt8Ty(), PtrEnd,
2994 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2995 Builder.CreateCondBr(
2996 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2997 ExitBB);
2998 emitBlock(ThenBB, CurFunc);
2999 Value *Res = createRuntimeShuffleFunction(
3000 AllocaIP,
3001 Builder.CreateAlignedLoad(
3002 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3003 IntType, Offset);
3004 Builder.CreateAlignedStore(Res, ElemPtr,
3005 M.getDataLayout().getPrefTypeAlign(ElemType));
3006 Value *LocalPtr =
3007 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3008 Value *LocalElemPtr =
3009 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3010 PhiSrc->addIncoming(LocalPtr, ThenBB);
3011 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3012 emitBranch(PreCondBB);
3013 emitBlock(ExitBB, CurFunc);
// Exactly one chunk of this width: straight-line shuffle, truncating the
// (widened) shuffle result back to narrower integer element types.
3014 } else {
3015 Value *Res = createRuntimeShuffleFunction(
3016 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3017 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3018 Res->getType()->getScalarSizeInBits())
3019 Res = Builder.CreateTrunc(Res, ElemType);
3020 Builder.CreateStore(Res, ElemPtr);
3021 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3022 ElemPtr =
3023 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3024 }
// Only the remainder of the element is left for the narrower widths.
3025 Size = Size % IntSize;
3026 }
3027}
3028
/// Copy a reduction list element-by-element from \p SrcBase to \p DestBase.
/// Depending on \p Action (case labels are elided in this dump), an element
/// is either shuffled in from a remote lane into a freshly allocated
/// temporary (whose address is then written back into the destination list),
/// or copied directly into the existing destination slot according to its
/// evaluation kind (scalar / complex / aggregate). By-ref elements copy the
/// underlying data (resolved through RI.DataPtrPtrGen) and rebuild the
/// descriptor via generateReductionDescriptor.
3029Error OpenMPIRBuilder::emitReductionListCopy(
3030 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3031 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3032 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3033 Type *IndexTy = Builder.getIndexTy(
3034 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3035 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3036
3037 // Iterates, element-by-element, through the source Reduce list and
3038 // make a copy.
3039 for (auto En : enumerate(ReductionInfos)) {
3040 const ReductionInfo &RI = En.value();
3041 Value *SrcElementAddr = nullptr;
3042 AllocaInst *DestAlloca = nullptr;
3043 Value *DestElementAddr = nullptr;
3044 Value *DestElementPtrAddr = nullptr;
3045 // Should we shuffle in an element from a remote lane?
3046 bool ShuffleInElement = false;
3047 // Set to true to update the pointer in the dest Reduce list to a
3048 // newly created element.
3049 bool UpdateDestListPtr = false;
3050
3051 // Step 1.1: Get the address for the src element in the Reduce list.
3052 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3053 ReductionArrayTy, SrcBase,
3054 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3055 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3056
3057 // Step 1.2: Create a temporary to store the element in the destination
3058 // Reduce list.
3059 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3060 ReductionArrayTy, DestBase,
3061 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3062 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3063 switch (Action) {
// This arm allocates a per-element temporary (in the alloca block) to
// receive the remote lane's value and flags the shuffle + list update.
3065 InsertPointTy CurIP = Builder.saveIP();
3066 Builder.restoreIP(AllocaIP);
3067
3068 Type *DestAllocaType =
3069 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3070 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3071 ".omp.reduction.element");
3072 DestAlloca->setAlignment(
3073 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3074 DestElementAddr = DestAlloca;
3075 DestElementAddr =
3076 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3077 DestElementAddr->getName() + ".ascast");
3078 Builder.restoreIP(CurIP);
3079 ShuffleInElement = true;
3080 UpdateDestListPtr = true;
3081 break;
3082 }
// This arm reuses the pointer already stored in the destination list.
3084 DestElementAddr =
3085 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3086 break;
3087 }
3088 }
3089
3090 // Now that all active lanes have read the element in the
3091 // Reduce list, shuffle over the value from the remote lane.
3092 if (ShuffleInElement) {
3093 Type *ShuffleType = RI.ElementType;
3094 Value *ShuffleSrcAddr = SrcElementAddr;
3095 Value *ShuffleDestAddr = DestElementAddr;
3096 AllocaInst *LocalStorage = nullptr;
3097
3098 if (IsByRefElem) {
3099 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3100 assert(RI.ByRefAllocatedType &&
3101 "Expected by-ref allocated type to be set");
3102 // For by-ref reductions, we need to copy from the remote lane the
3103 // actual value of the partial reduction computed by that remote lane;
3104 // rather than, for example, a pointer to that data or, even worse, a
3105 // pointer to the descriptor of the by-ref reduction element.
3106 ShuffleType = RI.ByRefElementType;
3107
3108 InsertPointOrErrorTy GenResult =
3109 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3110
3111 if (!GenResult)
3112 return GenResult.takeError();
3113
3114 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3115
// Stage the shuffled data into private local storage; the descriptor
// built below will point at this storage.
3116 {
3117 InsertPointTy OldIP = Builder.saveIP();
3118 Builder.restoreIP(AllocaIP);
3119
3120 LocalStorage = Builder.CreateAlloca(ShuffleType);
3121 Builder.restoreIP(OldIP);
3122 ShuffleDestAddr = LocalStorage;
3123 }
3124 }
3125
3126 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3127 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3128
3129 if (IsByRefElem) {
3130 // Copy descriptor from source and update base_ptr to shuffled data
3131 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3132 DestAlloca, Builder.getPtrTy(), ".ascast");
3133
3134 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3135 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3136 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3137
3138 if (!GenResult)
3139 return GenResult.takeError();
3140 }
// Plain (non-shuffled) copy: dispatch on how the element is evaluated.
3141 } else {
3142 switch (RI.EvaluationKind) {
3143 case EvalKind::Scalar: {
3144 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3145 // Store the source element value to the dest element address.
3146 Builder.CreateStore(Elem, DestElementAddr);
3147 break;
3148 }
3149 case EvalKind::Complex: {
3150 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3151 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3152 Value *SrcReal = Builder.CreateLoad(
3153 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3154 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3155 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3156 Value *SrcImg = Builder.CreateLoad(
3157 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3158
3159 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3160 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3161 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3162 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3163 Builder.CreateStore(SrcReal, DestRealPtr);
3164 Builder.CreateStore(SrcImg, DestImgPtr);
3165 break;
3166 }
3167 case EvalKind::Aggregate: {
3168 Value *SizeVal = Builder.getInt64(
3169 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3170 Builder.CreateMemCpy(
3171 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3172 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3173 SizeVal, false);
3174 break;
3175 }
3176 };
3177 }
3178
3179 // Step 3.1: Modify reference in dest Reduce list as needed.
3180 // Modifying the reference in Reduce list to point to the newly
3181 // created element. The element is live in the current function
3182 // scope and that of functions it invokes (i.e., reduce_function).
3183 // RemoteReduceData[i] = (void*)&RemoteElem
3184 if (UpdateDestListPtr) {
3185 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3186 DestElementAddr, Builder.getPtrTy(),
3187 DestElementAddr->getName() + ".ascast");
3188 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3189 }
3190 }
3191
3192 return Error::success();
3193}
3194
/// Synthesize the `_omp_reduction_inter_warp_copy_func(ptr ReduceList,
/// i32 NumWarps)` helper: each warp's master lane stores its partial
/// reduction values, 4/2/1-byte chunks at a time, into a shared-memory
/// transfer array, and (after a barrier) the first NumWarps threads of warp 0
/// read them back into their own reduce-list slots. Larger elements are
/// iterated with an emitted precond/body/exit counter loop. Returns the
/// created function, or an error propagated from barrier emission /
/// by-ref pointer generation. (NOTE(review): the Function::Create line is
/// elided in this dump.)
3195Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3196 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3197 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3198 InsertPointTy SavedIP = Builder.saveIP();
3199 LLVMContext &Ctx = M.getContext();
3200 FunctionType *FuncTy = FunctionType::get(
3201 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3202 /* IsVarArg */ false);
3203 Function *WcFunc =
3205 "_omp_reduction_inter_warp_copy_func", &M);
3206 WcFunc->setAttributes(FuncAttrs);
3207 WcFunc->addParamAttr(0, Attribute::NoUndef);
3208 WcFunc->addParamAttr(1, Attribute::NoUndef);
3209 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3210 Builder.SetInsertPoint(EntryBB);
3211
3212 // ReduceList: thread local Reduce list.
3213 // At the stage of the computation when this function is called, partially
3214 // aggregated values reside in the first lane of every active warp.
3215 Argument *ReduceListArg = WcFunc->getArg(0);
3216 // NumWarps: number of warps active in the parallel region. This could
3217 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3218 Argument *NumWarpsArg = WcFunc->getArg(1);
3219
3220 // This array is used as a medium to transfer, one reduce element at a time,
3221 // the data from the first lane of every warp to lanes in the first warp
3222 // in order to perform the final step of a reduction in a parallel region
3223 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3224 // for reduced latency, as well as to have a distinct copy for concurrently
3225 // executing target regions. The array is declared with common linkage so
3226 // as to be shared across compilation units.
3227 StringRef TransferMediumName =
3228 "__openmp_nvptx_data_transfer_temporary_storage";
3229 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3230 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3231 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3232 if (!TransferMedium) {
3233 TransferMedium = new GlobalVariable(
3234 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3235 UndefValue::get(ArrayTy), TransferMediumName,
3236 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3237 /*AddressSpace=*/3);
3238 }
3239
3240 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3241 Value *GPUThreadID = getGPUThreadID();
3242 // nvptx_lane_id = nvptx_id % warpsize
3243 Value *LaneID = getNVPTXLaneID();
3244 // nvptx_warp_id = nvptx_id / warpsize
3245 Value *WarpID = getNVPTXWarpID();
3246
// Spill the two arguments into entry-block allocas (address-space cast to
// generic) so the body can reload them like ordinary locals.
3247 InsertPointTy AllocaIP =
3248 InsertPointTy(Builder.GetInsertBlock(),
3249 Builder.GetInsertBlock()->getFirstInsertionPt());
3250 Type *Arg0Type = ReduceListArg->getType();
3251 Type *Arg1Type = NumWarpsArg->getType();
3252 Builder.restoreIP(AllocaIP);
3253 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3254 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3255 AllocaInst *NumWarpsAlloca =
3256 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3257 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3258 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3259 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3260 NumWarpsAlloca, Builder.getPtrTy(0),
3261 NumWarpsAlloca->getName() + ".ascast");
3262 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3263 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3264 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3265 InsertPointTy CodeGenIP =
3266 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3267 Builder.restoreIP(CodeGenIP);
3268
3269 Value *ReduceList =
3270 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3271
3272 for (auto En : enumerate(ReductionInfos)) {
3273 //
3274 // Warp master copies reduce element to transfer medium in __shared__
3275 // memory.
3276 //
3277 const ReductionInfo &RI = En.value();
3278 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3279 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3280 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
// Transfer the element through i32-sized shared slots, falling back to
// i16/i8 chunks for the remainder bytes.
3281 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3282 Type *CType = Builder.getIntNTy(TySize * 8);
3283
3284 unsigned NumIters = RealTySize / TySize;
3285 if (NumIters == 0)
3286 continue;
3287 Value *Cnt = nullptr;
3288 Value *CntAddr = nullptr;
3289 BasicBlock *PrecondBB = nullptr;
3290 BasicBlock *ExitBB = nullptr;
// Multiple chunks of this width: materialize a counter loop
// (precond/body/exit) around the per-chunk transfer below.
3291 if (NumIters > 1) {
3292 CodeGenIP = Builder.saveIP();
3293 Builder.restoreIP(AllocaIP);
3294 CntAddr =
3295 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3296
3297 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3298 CntAddr->getName() + ".ascast");
3299 Builder.restoreIP(CodeGenIP);
3300 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3301 CntAddr,
3302 /*Volatile=*/false);
3303 PrecondBB = BasicBlock::Create(Ctx, "precond");
3304 ExitBB = BasicBlock::Create(Ctx, "exit");
3305 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3306 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3307 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3308 /*Volatile=*/false);
3309 Value *Cmp = Builder.CreateICmpULT(
3310 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3311 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3312 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3313 }
3314
3315 // kmpc_barrier.
3316 InsertPointOrErrorTy BarrierIP1 =
3317 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3318 omp::Directive::OMPD_unknown,
3319 /* ForceSimpleCall */ false,
3320 /* CheckCancelFlag */ true);
3321 if (!BarrierIP1)
3322 return BarrierIP1.takeError();
3323 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3324 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3325 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3326
3327 // if (lane_id == 0)
3328 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3329 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3330 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3331
3332 // Reduce element = LocalReduceList[i]
3333 auto *RedListArrayTy =
3334 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3335 Type *IndexTy = Builder.getIndexTy(
3336 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3337 Value *ElemPtrPtr =
3338 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3339 {ConstantInt::get(IndexTy, 0),
3340 ConstantInt::get(IndexTy, En.index())});
3341 // elemptr = ((CopyType*)(elemptrptr)) + I
3342 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3343
// By-ref elements: resolve the pointer to the actual reduction data.
3344 if (IsByRefElem) {
3345 InsertPointOrErrorTy GenRes =
3346 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3347
3348 if (!GenRes)
3349 return GenRes.takeError();
3350
3351 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3352 }
3353
3354 if (NumIters > 1)
3355 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3356
3357 // Get pointer to location in transfer medium.
3358 // MediumPtr = &medium[warp_id]
3359 Value *MediumPtr = Builder.CreateInBoundsGEP(
3360 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3361 // elem = *elemptr
3362 //*MediumPtr = elem
3363 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3364 // Store the source element value to the dest element address.
3365 Builder.CreateStore(Elem, MediumPtr,
3366 /*IsVolatile*/ true);
3367 Builder.CreateBr(MergeBB);
3368
3369 // else
3370 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3371 Builder.CreateBr(MergeBB);
3372
3373 // endif
3374 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3375 InsertPointOrErrorTy BarrierIP2 =
3376 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3377 omp::Directive::OMPD_unknown,
3378 /* ForceSimpleCall */ false,
3379 /* CheckCancelFlag */ true);
3380 if (!BarrierIP2)
3381 return BarrierIP2.takeError();
3382
3383 // Warp 0 copies reduce element from transfer medium
3384 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3385 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3386 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3387
3388 Value *NumWarpsVal =
3389 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3390 // Up to 32 threads in warp 0 are active.
3391 Value *IsActiveThread =
3392 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3393 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3394
3395 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3396
3397 // SecMediumPtr = &medium[tid]
3398 // SrcMediumVal = *SrcMediumPtr
3399 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3400 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3401 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3402 Value *TargetElemPtrPtr =
3403 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3404 {ConstantInt::get(IndexTy, 0),
3405 ConstantInt::get(IndexTy, En.index())});
3406 Value *TargetElemPtrVal =
3407 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3408 Value *TargetElemPtr = TargetElemPtrVal;
3409
3410 if (IsByRefElem) {
3411 InsertPointOrErrorTy GenRes =
3412 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3413
3414 if (!GenRes)
3415 return GenRes.takeError();
3416
3417 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3418 }
3419
3420 if (NumIters > 1)
3421 TargetElemPtr =
3422 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3423
3424 // *TargetElemPtr = SrcMediumVal;
3425 Value *SrcMediumValue =
3426 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3427 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3428 Builder.CreateBr(W0MergeBB);
3429
3430 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3431 Builder.CreateBr(W0MergeBB);
3432
3433 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3434
// Close the counter loop opened above when the element needed several
// chunks of this width.
3435 if (NumIters > 1) {
3436 Cnt = Builder.CreateNSWAdd(
3437 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3438 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3439
3440 auto *CurFn = Builder.GetInsertBlock()->getParent();
3441 emitBranch(PrecondBB);
3442 emitBlock(ExitBB, CurFn);
3443 }
3444 RealTySize %= TySize;
3445 }
3446 }
3447
3448 Builder.CreateRetVoid();
3449 Builder.restoreIP(SavedIP);
3450
3451 return WcFunc;
3452}
3453
3454Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3455 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3456 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3457 LLVMContext &Ctx = M.getContext();
3458 FunctionType *FuncTy =
3459 FunctionType::get(Builder.getVoidTy(),
3460 {Builder.getPtrTy(), Builder.getInt16Ty(),
3461 Builder.getInt16Ty(), Builder.getInt16Ty()},
3462 /* IsVarArg */ false);
3463 Function *SarFunc =
3465 "_omp_reduction_shuffle_and_reduce_func", &M);
3466 SarFunc->setAttributes(FuncAttrs);
3467 SarFunc->addParamAttr(0, Attribute::NoUndef);
3468 SarFunc->addParamAttr(1, Attribute::NoUndef);
3469 SarFunc->addParamAttr(2, Attribute::NoUndef);
3470 SarFunc->addParamAttr(3, Attribute::NoUndef);
3471 SarFunc->addParamAttr(1, Attribute::SExt);
3472 SarFunc->addParamAttr(2, Attribute::SExt);
3473 SarFunc->addParamAttr(3, Attribute::SExt);
3474 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3475 Builder.SetInsertPoint(EntryBB);
3476
3477 // Thread local Reduce list used to host the values of data to be reduced.
3478 Argument *ReduceListArg = SarFunc->getArg(0);
3479 // Current lane id; could be logical.
3480 Argument *LaneIDArg = SarFunc->getArg(1);
3481 // Offset of the remote source lane relative to the current lane.
3482 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3483 // Algorithm version. This is expected to be known at compile time.
3484 Argument *AlgoVerArg = SarFunc->getArg(3);
3485
3486 Type *ReduceListArgType = ReduceListArg->getType();
3487 Type *LaneIDArgType = LaneIDArg->getType();
3488 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3489 Value *ReduceListAlloca = Builder.CreateAlloca(
3490 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3491 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3492 LaneIDArg->getName() + ".addr");
3493 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3494 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3495 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3496 AlgoVerArg->getName() + ".addr");
3497 ArrayType *RedListArrayTy =
3498 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3499
3500 // Create a local thread-private variable to host the Reduce list
3501 // from a remote lane.
3502 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3503 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3504
3505 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3506 ReduceListAlloca, ReduceListArgType,
3507 ReduceListAlloca->getName() + ".ascast");
3508 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3509 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3510 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3511 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3512 RemoteLaneOffsetAlloca->getName() + ".ascast");
3513 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3514 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3515 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3516 RemoteReductionListAlloca, Builder.getPtrTy(),
3517 RemoteReductionListAlloca->getName() + ".ascast");
3518
3519 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3520 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3521 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3522 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3523
3524 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3525 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3526 Value *RemoteLaneOffset =
3527 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3528 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3529
3530 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3531
3532 // This loop iterates through the list of reduce elements and copies,
3533 // element by element, from a remote lane in the warp to RemoteReduceList,
3534 // hosted on the thread's stack.
3535 Error EmitRedLsCpRes = emitReductionListCopy(
3536 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3537 ReduceList, RemoteListAddrCast, IsByRef,
3538 {RemoteLaneOffset, nullptr, nullptr});
3539
3540 if (EmitRedLsCpRes)
3541 return EmitRedLsCpRes;
3542
3543 // The actions to be performed on the Remote Reduce list is dependent
3544 // on the algorithm version.
3545 //
3546 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3547 // LaneId % 2 == 0 && Offset > 0):
3548 // do the reduction value aggregation
3549 //
3550 // The thread local variable Reduce list is mutated in place to host the
3551 // reduced data, which is the aggregated value produced from local and
3552 // remote lanes.
3553 //
3554 // Note that AlgoVer is expected to be a constant integer known at compile
3555 // time.
3556 // When AlgoVer==0, the first conjunction evaluates to true, making
3557 // the entire predicate true during compile time.
3558 // When AlgoVer==1, the second conjunction has only the second part to be
3559 // evaluated during runtime. Other conjunctions evaluates to false
3560 // during compile time.
3561 // When AlgoVer==2, the third conjunction has only the second part to be
3562 // evaluated during runtime. Other conjunctions evaluates to false
3563 // during compile time.
3564 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3565 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3566 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3567 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3568 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3569 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3570 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3571 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3572 Value *RemoteOffsetComp =
3573 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3574 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3575 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3576 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3577
3578 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3579 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3580 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3581
3582 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3583 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3584 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3585 ReduceList, Builder.getPtrTy());
3586 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3587 RemoteListAddrCast, Builder.getPtrTy());
3588 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3589 ->addFnAttr(Attribute::NoUnwind);
3590 Builder.CreateBr(MergeBB);
3591
3592 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3593 Builder.CreateBr(MergeBB);
3594
3595 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3596
3597 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3598 // Reduce list.
3599 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3600 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3601 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3602
3603 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3604 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3605 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3606 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3607
3608 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3609
3610 EmitRedLsCpRes = emitReductionListCopy(
3611 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3612 RemoteListAddrCast, ReduceList, IsByRef);
3613
3614 if (EmitRedLsCpRes)
3615 return EmitRedLsCpRes;
3616
3617 Builder.CreateBr(CpyMergeBB);
3618
3619 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3620 Builder.CreateBr(CpyMergeBB);
3621
3622 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3623
3624 Builder.CreateRetVoid();
3625
3626 return SarFunc;
3627}
3628
3630OpenMPIRBuilder::generateReductionDescriptor(
3631 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3632 Type *DescriptorType,
3633 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3634 DataPtrPtrGen) {
3635
3636 // Copy the source descriptor to preserve all metadata (rank, extents,
3637 // strides, etc.)
3638 Value *DescriptorSize =
3639 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3640 Builder.CreateMemCpy(
3641 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3642 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3643 DescriptorSize);
3644
3645 // Update the base pointer field to point to the local shuffled data
3646 Value *DataPtrField;
3647 InsertPointOrErrorTy GenResult =
3648 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3649
3650 if (!GenResult)
3651 return GenResult.takeError();
3652
3653 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3654 DataPtr, Builder.getPtrTy(), ".ascast"),
3655 DataPtrField);
3656
3657 return Builder.saveIP();
3658}
3659
3660Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3661 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3662 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3663 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3664 LLVMContext &Ctx = M.getContext();
3665 FunctionType *FuncTy = FunctionType::get(
3666 Builder.getVoidTy(),
3667 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3668 /* IsVarArg */ false);
3669 Function *LtGCFunc =
3671 "_omp_reduction_list_to_global_copy_func", &M);
3672 LtGCFunc->setAttributes(FuncAttrs);
3673 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3674 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3675 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3676
3677 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3678 Builder.SetInsertPoint(EntryBlock);
3679
3680 // Buffer: global reduction buffer.
3681 Argument *BufferArg = LtGCFunc->getArg(0);
3682 // Idx: index of the buffer.
3683 Argument *IdxArg = LtGCFunc->getArg(1);
3684 // ReduceList: thread local Reduce list.
3685 Argument *ReduceListArg = LtGCFunc->getArg(2);
3686
3687 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3688 BufferArg->getName() + ".addr");
3689 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3690 IdxArg->getName() + ".addr");
3691 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3692 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3693 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3694 BufferArgAlloca, Builder.getPtrTy(),
3695 BufferArgAlloca->getName() + ".ascast");
3696 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3697 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3698 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3699 ReduceListArgAlloca, Builder.getPtrTy(),
3700 ReduceListArgAlloca->getName() + ".ascast");
3701
3702 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3703 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3704 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3705
3706 Value *LocalReduceList =
3707 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3708 Value *BufferArgVal =
3709 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3710 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3711 Type *IndexTy = Builder.getIndexTy(
3712 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3713 for (auto En : enumerate(ReductionInfos)) {
3714 const ReductionInfo &RI = En.value();
3715 auto *RedListArrayTy =
3716 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3717 // Reduce element = LocalReduceList[i]
3718 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3719 RedListArrayTy, LocalReduceList,
3720 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3721 // elemptr = ((CopyType*)(elemptrptr)) + I
3722 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3723
3724 // Global = Buffer.VD[Idx];
3725 Value *BufferVD =
3726 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3727 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3728 ReductionsBufferTy, BufferVD, 0, En.index());
3729
3730 switch (RI.EvaluationKind) {
3731 case EvalKind::Scalar: {
3732 Value *TargetElement;
3733
3734 if (IsByRef.empty() || !IsByRef[En.index()]) {
3735 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3736 } else {
3737 InsertPointOrErrorTy GenResult =
3738 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3739
3740 if (!GenResult)
3741 return GenResult.takeError();
3742
3743 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3744 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3745 }
3746
3747 Builder.CreateStore(TargetElement, GlobVal);
3748 break;
3749 }
3750 case EvalKind::Complex: {
3751 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3752 RI.ElementType, ElemPtr, 0, 0, ".realp");
3753 Value *SrcReal = Builder.CreateLoad(
3754 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3755 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3756 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3757 Value *SrcImg = Builder.CreateLoad(
3758 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3759
3760 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3761 RI.ElementType, GlobVal, 0, 0, ".realp");
3762 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3763 RI.ElementType, GlobVal, 0, 1, ".imagp");
3764 Builder.CreateStore(SrcReal, DestRealPtr);
3765 Builder.CreateStore(SrcImg, DestImgPtr);
3766 break;
3767 }
3768 case EvalKind::Aggregate: {
3769 Value *SizeVal =
3770 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3771 Builder.CreateMemCpy(
3772 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3773 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3774 break;
3775 }
3776 }
3777 }
3778
3779 Builder.CreateRetVoid();
3780 Builder.restoreIP(OldIP);
3781 return LtGCFunc;
3782}
3783
3784Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3785 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3786 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3787 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3788 LLVMContext &Ctx = M.getContext();
3789 FunctionType *FuncTy = FunctionType::get(
3790 Builder.getVoidTy(),
3791 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3792 /* IsVarArg */ false);
3793 Function *LtGRFunc =
3795 "_omp_reduction_list_to_global_reduce_func", &M);
3796 LtGRFunc->setAttributes(FuncAttrs);
3797 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3798 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3799 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3800
3801 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3802 Builder.SetInsertPoint(EntryBlock);
3803
3804 // Buffer: global reduction buffer.
3805 Argument *BufferArg = LtGRFunc->getArg(0);
3806 // Idx: index of the buffer.
3807 Argument *IdxArg = LtGRFunc->getArg(1);
3808 // ReduceList: thread local Reduce list.
3809 Argument *ReduceListArg = LtGRFunc->getArg(2);
3810
3811 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3812 BufferArg->getName() + ".addr");
3813 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3814 IdxArg->getName() + ".addr");
3815 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3816 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3817 auto *RedListArrayTy =
3818 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3819
3820 // 1. Build a list of reduction variables.
3821 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3822 Value *LocalReduceList =
3823 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3824
3825 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3826
3827 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3828 BufferArgAlloca, Builder.getPtrTy(),
3829 BufferArgAlloca->getName() + ".ascast");
3830 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3831 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3832 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3833 ReduceListArgAlloca, Builder.getPtrTy(),
3834 ReduceListArgAlloca->getName() + ".ascast");
3835 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3836 LocalReduceList, Builder.getPtrTy(),
3837 LocalReduceList->getName() + ".ascast");
3838
3839 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3840 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3841 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3842
3843 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3844 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3845 Type *IndexTy = Builder.getIndexTy(
3846 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3847 for (auto En : enumerate(ReductionInfos)) {
3848 const ReductionInfo &RI = En.value();
3849 Value *ByRefAlloc;
3850
3851 if (!IsByRef.empty() && IsByRef[En.index()]) {
3852 InsertPointTy OldIP = Builder.saveIP();
3853 Builder.restoreIP(AllocaIP);
3854
3855 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3856 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3857 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3858
3859 Builder.restoreIP(OldIP);
3860 }
3861
3862 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3863 RedListArrayTy, LocalReduceListAddrCast,
3864 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3865 Value *BufferVD =
3866 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3867 // Global = Buffer.VD[Idx];
3868 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3869 ReductionsBufferTy, BufferVD, 0, En.index());
3870
3871 if (!IsByRef.empty() && IsByRef[En.index()]) {
3872 // Get source descriptor from the reduce list argument
3873 Value *ReduceList =
3874 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3875 Value *SrcElementPtrPtr =
3876 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3877 {ConstantInt::get(IndexTy, 0),
3878 ConstantInt::get(IndexTy, En.index())});
3879 Value *SrcDescriptorAddr =
3880 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3881
3882 // Copy descriptor from source and update base_ptr to global buffer data
3883 InsertPointOrErrorTy GenResult =
3884 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3885 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3886
3887 if (!GenResult)
3888 return GenResult.takeError();
3889
3890 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3891 } else {
3892 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3893 }
3894 }
3895
3896 // Call reduce_function(GlobalReduceList, ReduceList)
3897 Value *ReduceList =
3898 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3899 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3900 ->addFnAttr(Attribute::NoUnwind);
3901 Builder.CreateRetVoid();
3902 Builder.restoreIP(OldIP);
3903 return LtGRFunc;
3904}
3905
3906Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3907 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3908 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3909 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3910 LLVMContext &Ctx = M.getContext();
3911 FunctionType *FuncTy = FunctionType::get(
3912 Builder.getVoidTy(),
3913 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3914 /* IsVarArg */ false);
3915 Function *GtLCFunc =
3917 "_omp_reduction_global_to_list_copy_func", &M);
3918 GtLCFunc->setAttributes(FuncAttrs);
3919 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3920 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3921 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3922
3923 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3924 Builder.SetInsertPoint(EntryBlock);
3925
3926 // Buffer: global reduction buffer.
3927 Argument *BufferArg = GtLCFunc->getArg(0);
3928 // Idx: index of the buffer.
3929 Argument *IdxArg = GtLCFunc->getArg(1);
3930 // ReduceList: thread local Reduce list.
3931 Argument *ReduceListArg = GtLCFunc->getArg(2);
3932
3933 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3934 BufferArg->getName() + ".addr");
3935 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3936 IdxArg->getName() + ".addr");
3937 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3938 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3939 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3940 BufferArgAlloca, Builder.getPtrTy(),
3941 BufferArgAlloca->getName() + ".ascast");
3942 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3943 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3944 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3945 ReduceListArgAlloca, Builder.getPtrTy(),
3946 ReduceListArgAlloca->getName() + ".ascast");
3947 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3948 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3949 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3950
3951 Value *LocalReduceList =
3952 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3953 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3954 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3955 Type *IndexTy = Builder.getIndexTy(
3956 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3957 for (auto En : enumerate(ReductionInfos)) {
3958 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3959 auto *RedListArrayTy =
3960 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3961 // Reduce element = LocalReduceList[i]
3962 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3963 RedListArrayTy, LocalReduceList,
3964 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3965 // elemptr = ((CopyType*)(elemptrptr)) + I
3966 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3967 // Global = Buffer.VD[Idx];
3968 Value *BufferVD =
3969 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3970 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3971 ReductionsBufferTy, BufferVD, 0, En.index());
3972
3973 switch (RI.EvaluationKind) {
3974 case EvalKind::Scalar: {
3975 Type *ElemType = RI.ElementType;
3976
3977 if (!IsByRef.empty() && IsByRef[En.index()]) {
3978 ElemType = RI.ByRefElementType;
3979 InsertPointOrErrorTy GenResult =
3980 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3981
3982 if (!GenResult)
3983 return GenResult.takeError();
3984
3985 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3986 }
3987
3988 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3989 Builder.CreateStore(TargetElement, ElemPtr);
3990 break;
3991 }
3992 case EvalKind::Complex: {
3993 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3994 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3995 Value *SrcReal = Builder.CreateLoad(
3996 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3997 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3998 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3999 Value *SrcImg = Builder.CreateLoad(
4000 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4001
4002 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4003 RI.ElementType, ElemPtr, 0, 0, ".realp");
4004 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4005 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4006 Builder.CreateStore(SrcReal, DestRealPtr);
4007 Builder.CreateStore(SrcImg, DestImgPtr);
4008 break;
4009 }
4010 case EvalKind::Aggregate: {
4011 Value *SizeVal =
4012 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4013 Builder.CreateMemCpy(
4014 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4015 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4016 SizeVal, false);
4017 break;
4018 }
4019 }
4020 }
4021
4022 Builder.CreateRetVoid();
4023 Builder.restoreIP(OldIP);
4024 return GtLCFunc;
4025}
4026
4027Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4028 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4029 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4030 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4031 LLVMContext &Ctx = M.getContext();
4032 auto *FuncTy = FunctionType::get(
4033 Builder.getVoidTy(),
4034 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4035 /* IsVarArg */ false);
4036 Function *GtLRFunc =
4038 "_omp_reduction_global_to_list_reduce_func", &M);
4039 GtLRFunc->setAttributes(FuncAttrs);
4040 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4041 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4042 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4043
4044 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4045 Builder.SetInsertPoint(EntryBlock);
4046
4047 // Buffer: global reduction buffer.
4048 Argument *BufferArg = GtLRFunc->getArg(0);
4049 // Idx: index of the buffer.
4050 Argument *IdxArg = GtLRFunc->getArg(1);
4051 // ReduceList: thread local Reduce list.
4052 Argument *ReduceListArg = GtLRFunc->getArg(2);
4053
4054 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4055 BufferArg->getName() + ".addr");
4056 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4057 IdxArg->getName() + ".addr");
4058 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4059 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4060 ArrayType *RedListArrayTy =
4061 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4062
4063 // 1. Build a list of reduction variables.
4064 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4065 Value *LocalReduceList =
4066 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4067
4068 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4069
4070 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4071 BufferArgAlloca, Builder.getPtrTy(),
4072 BufferArgAlloca->getName() + ".ascast");
4073 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4074 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4075 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4076 ReduceListArgAlloca, Builder.getPtrTy(),
4077 ReduceListArgAlloca->getName() + ".ascast");
4078 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4079 LocalReduceList, Builder.getPtrTy(),
4080 LocalReduceList->getName() + ".ascast");
4081
4082 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4083 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4084 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4085
4086 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4087 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4088 Type *IndexTy = Builder.getIndexTy(
4089 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4090 for (auto En : enumerate(ReductionInfos)) {
4091 const ReductionInfo &RI = En.value();
4092 Value *ByRefAlloc;
4093
4094 if (!IsByRef.empty() && IsByRef[En.index()]) {
4095 InsertPointTy OldIP = Builder.saveIP();
4096 Builder.restoreIP(AllocaIP);
4097
4098 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4099 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4100 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4101
4102 Builder.restoreIP(OldIP);
4103 }
4104
4105 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4106 RedListArrayTy, ReductionList,
4107 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4108 // Global = Buffer.VD[Idx];
4109 Value *BufferVD =
4110 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4111 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4112 ReductionsBufferTy, BufferVD, 0, En.index());
4113
4114 if (!IsByRef.empty() && IsByRef[En.index()]) {
4115 // Get source descriptor from the reduce list
4116 Value *ReduceListVal =
4117 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4118 Value *SrcElementPtrPtr =
4119 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4120 {ConstantInt::get(IndexTy, 0),
4121 ConstantInt::get(IndexTy, En.index())});
4122 Value *SrcDescriptorAddr =
4123 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4124
4125 // Copy descriptor from source and update base_ptr to global buffer data
4126 InsertPointOrErrorTy GenResult =
4127 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4128 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4129 if (!GenResult)
4130 return GenResult.takeError();
4131
4132 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4133 } else {
4134 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4135 }
4136 }
4137
4138 // Call reduce_function(ReduceList, GlobalReduceList)
4139 Value *ReduceList =
4140 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4141 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4142 ->addFnAttr(Attribute::NoUnwind);
4143 Builder.CreateRetVoid();
4144 Builder.restoreIP(OldIP);
4145 return GtLRFunc;
4146}
4147
4148std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4149 std::string Suffix =
4150 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4151 return (Name + Suffix).str();
4152}
4153
/// Create the outlined reduction function used by the GPU reduction runtime:
///   void <name>(ptr %lhs_array, ptr %rhs_array)
/// Both arguments are arrays of type-erased pointers, one element per
/// ReductionInfo; element i of the RHS array is combined into element i of
/// the LHS array via the per-info reduction generator callback.
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
    AttributeList FuncAttrs) {
  // Signature: void(ptr, ptr); both parameters are marked noundef below.
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getPtrTy()},
                                   /* IsVarArg */ false);
  std::string Name = getReductionFuncName(ReducerName);
  Function *ReductionFunc =
  ReductionFunc->setAttributes(FuncAttrs);
  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
  BasicBlock *EntryBB =
      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
  Builder.SetInsertPoint(EntryBB);

  // Need to alloca memory here and deal with the pointers before getting
  // LHS/RHS pointers out. The store/reload through allocas plus the
  // addrspace casts materialize any address-space conversion the target
  // needs for the incoming pointers (NOTE(review): presumably for device
  // targets with non-generic argument address spaces — confirm).
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  Argument *Arg0 = ReductionFunc->getArg(0);
  Argument *Arg1 = ReductionFunc->getArg(1);
  Type *Arg0Type = Arg0->getType();
  Type *Arg1Type = Arg1->getType();

  Value *LHSAlloca =
      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
  Value *RHSAlloca =
      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
  Builder.CreateStore(Arg0, LHSAddrCast);
  Builder.CreateStore(Arg1, RHSAddrCast);
  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);

  // Both arguments are viewed as [N x ptr]; walk them element-by-element.
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  SmallVector<Value *> LHSPtrs, RHSPtrs;
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // Load the i-th type-erased pointer from each array and cast it to the
    // pointer type of the corresponding reduction variable.
    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, RHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType(),
        RHSI8Ptr->getName() + ".ascast");

    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, LHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");

      // Clang-callback path: just remember the element pointers; the actual
      // reduction body is emitted by ReductionGenClang below and patched to
      // use these pointers.
      LHSPtrs.emplace_back(LHSPtr);
      RHSPtrs.emplace_back(RHSPtr);
    } else {
      // Generic path: invoke the reduction generator directly. For by-value
      // reductions the element values are loaded first and the combined
      // result is stored back; by-ref reductions operate on the pointers.
      Value *LHS = LHSPtr;
      Value *RHS = RHSPtr;

      if (!IsByRef.empty() && !IsByRef[En.index()]) {
        LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
        RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
      }

      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      // A cleared insert point signals the callback terminated codegen;
      // return the (partially built) function as-is.
      if (!Builder.GetInsertBlock())
        return ReductionFunc;

      Builder.restoreIP(*AfterIP);

      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, LHSPtr);
    }
  }

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *LHSFixupPtr, *RHSFixupPtr;
    Builder.restoreIP(RI.ReductionGenClang(
        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));

    // Fix the callback-generated code to use the correct Values for the LHS
    // and RHS. Only uses inside ReductionFunc are rewritten.
    LHSFixupPtr->replaceUsesWithIf(
        LHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
    RHSFixupPtr->replaceUsesWithIf(
        RHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
  }

  Builder.CreateRetVoid();
  // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
  // to the entry block (this is done for higher opt levels by later passes in
  // the pipeline). This has caused issues because non-entry `alloca`s force the
  // function to use dynamic stack allocations and we might run out of scratch
  // memory.
  hoistNonEntryAllocasToEntryBlock(ReductionFunc);

  return ReductionFunc;
}
4273
/// Debug-build sanity checks on a list of ReductionInfo descriptors before
/// any IR is emitted: each entry must have a variable, a private variable and
/// at least one reduction-generator callback, and the variable must be a
/// pointer. On the host path the original and private variables must also
/// have the same type; the GPU path skips that check (NOTE(review):
/// presumably because privates may live in a different address space —
/// confirm).
static void
    bool IsGPU) {
  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
    (void)RI; // Silence unused-variable warnings in release builds.
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert((RI.ReductionGen || RI.ReductionGenClang) &&
           "expected non-null reduction generator callback");
    if (!IsGPU) {
      assert(
          RI.Variable->getType() == RI.PrivateVariable->getType() &&
          "expected variables and their private equivalents to have the same "
          "type");
    }
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }
}
4293
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  // Emits the GPU reduction sequence: build the type-erased list of private
  // reduction values, outline the helper functions the device runtime needs
  // (shuffle-and-reduce, inter-warp copy, and for teams reductions the
  // list<->global copy/reduce helpers), call the
  // __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2 entry point, and finally
  // fold the winning thread's values back into the original variables.
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct; synthesize one from Loc if the
  // caller did not provide it.
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  BasicBlock *ContinuationBlock = nullptr;
    // Copied code from createReductions
    BasicBlock *InsertBlock = Loc.IP.getBlock();
    ContinuationBlock =
        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
    InsertBlock->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
  }

  // Propagate the enclosing function's attributes to the outlined helpers,
  // but drop optnone so they remain optimizable.
  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult = createReductionFunction(
      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
      ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config needed for lowering later on
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
  Type *FuncPtrTy =
      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The list itself is allocated at AllocaIP so it lands in the entry block.
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});

    // For by-ref reductions the private variable holds a pointer to the
    // data, so load it before storing into the list.
    Value *PrivateVar = RI.PrivateVariable;
    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    if (IsByRefElem)
      PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);

    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  // 2. Emit the helper functions required by the device runtime.
  CodeGenIP = Builder.saveIP();
  Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);

  if (!SarFunc)
    return SarFunc.takeError();

  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // NOTE: ReductionDataSize is passed as the reduce_data_size
  // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
  // the runtime implementations do not currently use it. The teams
  // runtime reads ReductionDataSize from KernelEnvironmentTy instead
  // (set separately via TargetKernelDefaultAttrs). It is computed
  // here conservatively as max(element sizes) * N rather than the
  // exact sum, which over-calculates the size for mixed reduction
  // types but is harmless given the argument is unused.
  // TODO: Consider dropping this computation if the runtime API is
  // ever revised to remove the unused parameter.
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
    // the actual data size stored in the global reduction buffer, consistent
    // with the ReductionsBufferTy struct used for GEP offsets below.
    Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
                           ? En.value().ByRefElementType
                           : En.value().ElementType;
    auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(RedTypeArg);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    // Parallel reduction: only the shuffle-and-reduce and inter-warp copy
    // helpers are needed; they are passed as program-address-space pointers.
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(Pv2Ptr, Args);
  } else {
    // Teams reduction: additionally needs a fixed global buffer plus the four
    // list<->global copy/reduce helpers operating on that buffer's layout.
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);

    Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGCFunc)
      return LtGCFunc.takeError();

    Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGRFunc)
      return LtGRFunc.takeError();

    Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLCFunc)
      return GtLCFunc.takeError();

    Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLRFunc)
      return GtLRFunc.takeError();

    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      *SarFunc,
                      WcFunc,
                      *LtGCFunc,
                      *LtGRFunc,
                      *GtLCFunc,
                      *GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1): the runtime returns 1 on the thread that must
  // perform the final combination into the original variables.
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *RedValue = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

      // Clang-callback path: emit the combination via ReductionGenClang and
      // patch the placeholder pointers afterwards.
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the callback-generated code to use the correct Values for the LHS
      // and RHS
      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      // Generic path: load values (unless by-ref), combine via ReductionGen,
      // and store the result back to the original variable.
      if (IsByRef.empty() || !IsByRef[En.index()]) {
        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                      "red.value." + Twine(En.index()));
      }
      Value *PrivateRedValue = Builder.CreateLoad(
          ValueType, RHS, "red.private.value" + Twine(En.index()));
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);

      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, RI.Variable);
    }
  }
  emitBlock(ExitBB, CurFunc);
  if (ContinuationBlock) {
    Builder.CreateBr(ContinuationBlock);
    Builder.SetInsertPoint(ContinuationBlock);
  }
  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}
4549
  // Declares a fresh ".omp.reduction.func" in module M with signature
  // void(ptr, ptr) — the shape expected by __kmpc_reduce{_nowait} — leaving
  // the body to be filled in by populateReductionFunction.
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
                          ".omp.reduction.func", &M);
}
4558
    Function *ReductionFunc,
    IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
  // Fills in the body of an outlined reduction function whose two ptr
  // arguments are arrays of type-erased element pointers: for each
  // ReductionInfo, load both elements, combine them with RI.ReductionGen,
  // and (for by-value reductions) store the result back through the LHS
  // pointer.
  Module *Module = ReductionFunc->getParent();
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  if (IsGPU) {
    // Need to alloca memory here and deal with the pointers before getting
    // LHS/RHS pointers out. The store/reload round-trip materializes any
    // address-space casts the device target needs for the arguments.
    //
    Argument *Arg0 = ReductionFunc->getArg(0);
    Argument *Arg1 = ReductionFunc->getArg(1);
    Type *Arg0Type = Arg0->getType();
    Type *Arg1Type = Arg1->getType();

    Value *LHSAlloca =
        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
    Value *RHSAlloca =
        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
    Value *LHSAddrCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
    Value *RHSAddrCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
    Builder.CreateStore(Arg0, LHSAddrCast);
    Builder.CreateStore(Arg1, RHSAddrCast);
    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
  } else {
    // Host path: the arguments are usable directly.
    LHSArrayPtr = ReductionFunc->getArg(0);
    RHSArrayPtr = ReductionFunc->getArg(1);
  }

  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);

  for (auto En : enumerate(ReductionInfos)) {
    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
    // Extract the i-th element pointer from each array and load the values
    // to be combined.
    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType());
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();

    Builder.restoreIP(*AfterIP);
    // TODO: Consider flagging an error.
    if (!Builder.GetInsertBlock())
      return Error::success();

    // store is inside of the reduction region when using by-ref
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, LHSPtr);
  }
  Builder.CreateRetVoid();
  return Error::success();
}
4630
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
    bool IsNoWait, bool IsTeamsReduction) {
  // Host reduction entry point. On GPU configs this immediately delegates to
  // createReductionsGPU; otherwise it emits the classic
  // __kmpc_reduce{_nowait} sequence: build a type-erased array of private
  // values, call the runtime, then switch on the result to either the
  // non-atomic combination path (case 1) or the atomic path (case 2).
  assert(ReductionInfos.size() == IsByRef.size());
  if (Config.isGPU())
    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
                               IsByRef, IsNoWait, IsTeamsReduction);

  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);

  if (!updateToLocation(Loc))
    return InsertPointTy();

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  // Split off a continuation block; all reduction paths rejoin there.
  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // The atomic fallback (case 2) is only viable when every reduction has an
  // atomic generator; advertise that to the runtime through the ident flags.
  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
    return RI.AtomicReductionGen;
  });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      createRuntimeFunctionCall(ReduceFunc,
                                {Ident, ThreadId, NumVariables, RedArraySize,
                                 RedArray, ReductionFunc, Lock},
                                "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // We have one less load for by-ref case because that load is now inside of
    // the reduction region
    Value *RedValue = RI.Variable;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    InsertPointOrErrorTy AfterIP =
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);

    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // for by-ref case, the load is inside of the reduction region
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
    for (const ReductionInfo &RI : ReductionInfos) {
          Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    // The ident flags told the runtime not to pick the atomic path, so this
    // block should never execute.
    Builder.CreateUnreachable();
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
                                        IsByRef, /*isGPU=*/false);
  if (Err)
    return Err;

  if (!Builder.GetInsertBlock())
    return InsertPointTy();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
4782
                                                  BodyGenCallbackTy BodyGenCB,
                                                  FinalizeCallbackTy FiniCB) {
  // Emits an OpenMP 'master' region as an inlined region guarded by
  // __kmpc_master / __kmpc_end_master; the body runs conditionally on the
  // master thread only.
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_master;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);

  // Conditional: only the thread for which __kmpc_master returns non-zero
  // executes the body; hasFinalize: FiniCB runs on region exit.
  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
4806
                                                  BodyGenCallbackTy BodyGenCB,
                                                  FinalizeCallbackTy FiniCB, Value *Filter) {
  // Emits an OpenMP 'masked' region as an inlined region guarded by
  // __kmpc_masked(ident, tid, filter) / __kmpc_end_masked(ident, tid).
  // Unlike 'master', the entry call takes the Filter thread-id expression.
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_masked;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, Filter};
  Value *ArgsEnd[] = {Ident, ThreadId}; // end call takes no filter

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
4831
                                               llvm::FunctionCallee Callee,
                                               const llvm::Twine &Name) {
  // Emits a call to Callee and marks the call site nounwind
  // (setDoesNotThrow), as runtime helpers are not expected to unwind.
  llvm::CallInst *Call = Builder.CreateCall(
      Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
  Call->setDoesNotThrow();
  return Call;
}
4841
// Expects the input basic block to be dominated by BeforeScanBB.
// Once the scan directive is encountered, the code after it should be
// dominated by AfterScanBB. The scan directive splits the code sequence into
// an input phase and a scan phase. Based on whether the inclusive or
// exclusive clause is used on the scan directive, and whether the input loop
// or the scan loop is being lowered, it adds jumps to the input and scan
// phases. The first scan loop is the input loop and the second is the scan
// loop. The code generated currently handles only inclusive scans.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
    bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) loop, lazily emit the buffer declarations shared by
  // both phases.
  if (ScanRedInfo->OMPFirstScanLoop) {
    llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
                                                    ScanVarsType, ScanRedInfo);
    if (Err)
      return Err;
  }
  if (!updateToLocation(Loc))
    return Loc.IP;

  llvm::Value *IV = ScanRedInfo->IV;

  if (ScanRedInfo->OMPFirstScanLoop) {
    // Emit buffer[i] = red; at the end of the input phase.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);

      Builder.CreateStore(Src, Val);
    }
  }
  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
  emitBlock(ScanRedInfo->OMPScanDispatch,
            Builder.GetInsertBlock()->getParent());

  if (!ScanRedInfo->OMPFirstScanLoop) {
    IV = ScanRedInfo->IV;
    // Emit red = buffer[i]; at the entrance to the scan phase.
    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *SrcPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
      Builder.CreateStore(Src, ScanVars[i]);
    }
  }

  // TODO: Update it to CreateBr and remove dead blocks
  // The branch condition is the constant true, so one successor is dead; the
  // ordering below picks which phase block follows for this loop/clause
  // combination.
  llvm::Value *CmpI = Builder.getInt1(true);
  if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
                         ScanRedInfo->OMPAfterScanBlock);
  } else {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
                         ScanRedInfo->OMPBeforeScanBlock);
  }
  emitBlock(ScanRedInfo->OMPAfterScanBlock,
            Builder.GetInsertBlock()->getParent());
  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
  return Builder.saveIP();
}
4910
/// Emit the allocations needed by a scan-based directive: one pointer alloca
/// per scan variable (recorded in ScanRedInfo->ScanBuffPtrs), plus a
/// masked-region (thread 0) that mallocs a Span+1 element temporary buffer
/// for each variable, followed by a barrier so all threads observe the
/// buffers.
Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {

  Builder.restoreIP(AllocaIP);
  // Create the shared pointer at alloca IP.
  for (size_t i = 0; i < ScanVars.size(); i++) {
    llvm::Value *BuffPtr =
        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
  }

  // Allocate temporary buffer by master thread
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    // Span + 1 elements per variable.
    Value *AllocSpan =
        Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Type *IntPtrTy = Builder.getInt32Ty();
      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
                                         AllocSpan, nullptr, "arr");
      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Run the allocation under a masked region filtered to thread 0.
  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
  llvm::Value *FilterVal = Builder.getInt32(0);
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier: all threads must see the buffers before the input phase starts.
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);

  return Error::success();
}
4961
/// Emit the finalization for a scan-based directive: under a masked region
/// (thread 0), copy the last element (index Span) of each temporary scan
/// buffer back to its original reduction variable and free the buffer, then
/// emit a barrier.
Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *PrivateVar = RedInfo.PrivateVariable;
      Value *OrigVar = RedInfo.Variable;
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);

      Type *SrcTy = RedInfo.ElementType;
      // buffer[Span] holds the final reduced value.
      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
                                             "arrayOffset");
      Value *Src = Builder.CreateLoad(SrcTy, Val);

      Builder.CreateStore(Src, OrigVar);
      Builder.CreateFree(Buff);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Insert before the terminator of the scan-finish block if it has one.
  if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
    Builder.SetInsertPoint(TI);
  else
    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);

  // Thread 0 performs the copy-back and free.
  llvm::Value *FilterVal = Builder.getInt32(0);
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  return Error::success();
}
5008
5010 const LocationDescription &Loc,
5012 ScanInfo *ScanRedInfo) {
5013
5014 if (!updateToLocation(Loc))
5015 return Loc.IP;
5016 auto BodyGenCB = [&](InsertPointTy AllocaIP,
5017 InsertPointTy CodeGenIP) -> Error {
5018 Builder.restoreIP(CodeGenIP);
5019 Function *CurFn = Builder.GetInsertBlock()->getParent();
5020 // for (int k = 0; k <= ceil(log2(n)); ++k)
5021 llvm::BasicBlock *LoopBB =
5022 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5023 llvm::BasicBlock *ExitBB =
5024 splitBB(Builder, false, "omp.outer.log.scan.exit");
5026 Builder.GetInsertBlock()->getModule(),
5027 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5028 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
5029 llvm::Value *Arg =
5030 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5031 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5033 Builder.GetInsertBlock()->getModule(),
5034 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5035 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5036 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5037 llvm::Value *NMin1 = Builder.CreateNUWSub(
5038 ScanRedInfo->Span,
5039 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5040 Builder.SetInsertPoint(InputBB);
5041 Builder.CreateBr(LoopBB);
5042 emitBlock(LoopBB, CurFn);
5043 Builder.SetInsertPoint(LoopBB);
5044
5045 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5046 // size pow2k = 1;
5047 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5048 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5049 InputBB);
5050 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5051 InputBB);
5052 // for (size i = n - 1; i >= 2 ^ k; --i)
5053 // tmp[i] op= tmp[i-pow2k];
5054 llvm::BasicBlock *InnerLoopBB =
5055 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5056 llvm::BasicBlock *InnerExitBB =
5057 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5058 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5059 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5060 emitBlock(InnerLoopBB, CurFn);
5061 Builder.SetInsertPoint(InnerLoopBB);
5062 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5063 IVal->addIncoming(NMin1, LoopBB);
5064 for (ReductionInfo RedInfo : ReductionInfos) {
5065 Value *ReductionVal = RedInfo.PrivateVariable;
5066 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5067 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5068 Type *DestTy = RedInfo.ElementType;
5069 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5070 Value *LHSPtr =
5071 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5072 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5073 Value *RHSPtr =
5074 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5075 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5076 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5077 llvm::Value *Result;
5078 InsertPointOrErrorTy AfterIP =
5079 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5080 if (!AfterIP)
5081 return AfterIP.takeError();
5082 Builder.CreateStore(Result, LHSPtr);
5083 }
5084 llvm::Value *NextIVal = Builder.CreateNUWSub(
5085 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5086 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5087 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5088 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5089 emitBlock(InnerExitBB, CurFn);
5090 llvm::Value *Next = Builder.CreateNUWAdd(
5091 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5092 Counter->addIncoming(Next, Builder.GetInsertBlock());
5093 // pow2k <<= 1;
5094 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5095 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5096 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5097 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5098 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5099 return Error::success();
5100 };
5101
5102 // TODO: Perform finalization actions for variables. This has to be
5103 // called for variables which have destructors/finalizers.
5104 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5105
5106 llvm::Value *FilterVal = Builder.getInt32(0);
5108 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5109
5110 if (!AfterIP)
5111 return AfterIP.takeError();
5112 Builder.restoreIP(*AfterIP);
5113 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5114
5115 if (!AfterIP)
5116 return AfterIP.takeError();
5117 Builder.restoreIP(*AfterIP);
5118 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5119 if (Err)
5120 return Err;
5121
5122 return AfterIP;
5123}
5124
5125Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5126 llvm::function_ref<Error()> InputLoopGen,
5127 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5128 ScanInfo *ScanRedInfo) {
5129
5130 {
5131 // Emit loop with input phase:
5132 // for (i: 0..<num_iters>) {
5133 // <input phase>;
5134 // buffer[i] = red;
5135 // }
5136 ScanRedInfo->OMPFirstScanLoop = true;
5137 Error Err = InputLoopGen();
5138 if (Err)
5139 return Err;
5140 }
5141 {
5142 // Emit loop with scan phase:
5143 // for (i: 0..<num_iters>) {
5144 // red = buffer[i];
5145 // <scan phase>;
5146 // }
5147 ScanRedInfo->OMPFirstScanLoop = false;
5148 Error Err = ScanLoopGen(Builder.saveIP());
5149 if (Err)
5150 return Err;
5151 }
5152 return Error::success();
5153}
5154
5155void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5156 Function *Fun = Builder.GetInsertBlock()->getParent();
5157 ScanRedInfo->OMPScanDispatch =
5158 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5159 ScanRedInfo->OMPAfterScanBlock =
5160 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5161 ScanRedInfo->OMPBeforeScanBlock =
5162 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5163 ScanRedInfo->OMPScanLoopExit =
5164 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5165}
5167 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5168 BasicBlock *PostInsertBefore, const Twine &Name) {
5169 Module *M = F->getParent();
5170 LLVMContext &Ctx = M->getContext();
5171 Type *IndVarTy = TripCount->getType();
5172
5173 // Create the basic block structure.
5174 BasicBlock *Preheader =
5175 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5176 BasicBlock *Header =
5177 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5178 BasicBlock *Cond =
5179 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5180 BasicBlock *Body =
5181 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5182 BasicBlock *Latch =
5183 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5184 BasicBlock *Exit =
5185 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5186 BasicBlock *After =
5187 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5188
5189 // Use specified DebugLoc for new instructions.
5190 Builder.SetCurrentDebugLocation(DL);
5191
5192 Builder.SetInsertPoint(Preheader);
5193 Builder.CreateBr(Header);
5194
5195 Builder.SetInsertPoint(Header);
5196 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5197 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5198 Builder.CreateBr(Cond);
5199
5200 Builder.SetInsertPoint(Cond);
5201 Value *Cmp =
5202 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5203 Builder.CreateCondBr(Cmp, Body, Exit);
5204
5205 Builder.SetInsertPoint(Body);
5206 Builder.CreateBr(Latch);
5207
5208 Builder.SetInsertPoint(Latch);
5209 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5210 "omp_" + Name + ".next", /*HasNUW=*/true);
5211 Builder.CreateBr(Header);
5212 IndVarPHI->addIncoming(Next, Latch);
5213
5214 Builder.SetInsertPoint(Exit);
5215 Builder.CreateBr(After);
5216
5217 // Remember and return the canonical control flow.
5218 LoopInfos.emplace_front();
5219 CanonicalLoopInfo *CL = &LoopInfos.front();
5220
5221 CL->Header = Header;
5222 CL->Cond = Cond;
5223 CL->Latch = Latch;
5224 CL->Exit = Exit;
5225
5226#ifndef NDEBUG
5227 CL->assertOK();
5228#endif
5229 return CL;
5230}
5231
5234 LoopBodyGenCallbackTy BodyGenCB,
5235 Value *TripCount, const Twine &Name) {
5236 BasicBlock *BB = Loc.IP.getBlock();
5237 BasicBlock *NextBB = BB->getNextNode();
5238
5239 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5240 NextBB, NextBB, Name);
5241 BasicBlock *After = CL->getAfter();
5242
5243 // If location is not set, don't connect the loop.
5244 if (updateToLocation(Loc)) {
5245 // Split the loop at the insertion point: Branch to the preheader and move
5246 // every following instruction to after the loop (the After BB). Also, the
5247 // new successor is the loop's after block.
5248 spliceBB(Builder, After, /*CreateBranch=*/false);
5249 Builder.CreateBr(CL->getPreheader());
5250 }
5251
5252 // Emit the body content. We do it after connecting the loop to the CFG to
5253 // avoid that the callback encounters degenerate BBs.
5254 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5255 return Err;
5256
5257#ifndef NDEBUG
5258 CL->assertOK();
5259#endif
5260 return CL;
5261}
5262
5264 ScanInfos.emplace_front();
5265 ScanInfo *Result = &ScanInfos.front();
5266 return Result;
5267}
5268
5272 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5273 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5274 LocationDescription ComputeLoc =
5275 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5276 updateToLocation(ComputeLoc);
5277
5279
5281 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5282 ScanRedInfo->Span = TripCount;
5283 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5284 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5285
5286 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5287 Builder.restoreIP(CodeGenIP);
5288 ScanRedInfo->IV = IV;
5289 createScanBBs(ScanRedInfo);
5290 BasicBlock *InputBlock = Builder.GetInsertBlock();
5291 Instruction *Terminator = InputBlock->getTerminator();
5292 assert(Terminator->getNumSuccessors() == 1);
5293 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5294 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5295 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5296 Builder.GetInsertBlock()->getParent());
5297 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5298 emitBlock(ScanRedInfo->OMPScanLoopExit,
5299 Builder.GetInsertBlock()->getParent());
5300 Builder.CreateBr(ContinueBlock);
5301 Builder.SetInsertPoint(
5302 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5303 return BodyGenCB(Builder.saveIP(), IV);
5304 };
5305
5306 const auto &&InputLoopGen = [&]() -> Error {
5308 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5309 ComputeIP, Name, true, ScanRedInfo);
5310 if (!LoopInfo)
5311 return LoopInfo.takeError();
5312 Result.push_back(*LoopInfo);
5313 Builder.restoreIP((*LoopInfo)->getAfterIP());
5314 return Error::success();
5315 };
5316 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5318 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5319 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5320 if (!LoopInfo)
5321 return LoopInfo.takeError();
5322 Result.push_back(*LoopInfo);
5323 Builder.restoreIP((*LoopInfo)->getAfterIP());
5324 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5325 return Error::success();
5326 };
5327 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5328 if (Err)
5329 return Err;
5330 return Result;
5331}
5332
5334 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5335 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5336
5337 // Consider the following difficulties (assuming 8-bit signed integers):
5338 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5339 // DO I = 1, 100, 50
5340 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
5341 // DO I = 100, 0, -128
5342
5343 // Start, Stop and Step must be of the same integer type.
5344 auto *IndVarTy = cast<IntegerType>(Start->getType());
5345 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5346 assert(IndVarTy == Step->getType() && "Step type mismatch");
5347
5349
5350 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5351 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5352
5353 // Like Step, but always positive.
5354 Value *Incr = Step;
5355
5356 // Distance between Start and Stop; always positive.
5357 Value *Span;
5358
5359 // Condition whether there are no iterations are executed at all, e.g. because
5360 // UB < LB.
5361 Value *ZeroCmp;
5362
5363 if (IsSigned) {
5364 // Ensure that increment is positive. If not, negate and invert LB and UB.
5365 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5366 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5367 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5368 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5369 Span = Builder.CreateSub(UB, LB, "", false, true);
5370 ZeroCmp = Builder.CreateICmp(
5371 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5372 } else {
5373 Span = Builder.CreateSub(Stop, Start, "", true);
5374 ZeroCmp = Builder.CreateICmp(
5375 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5376 }
5377
5378 Value *CountIfLooping;
5379 if (InclusiveStop) {
5380 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5381 } else {
5382 // Avoid incrementing past stop since it could overflow.
5383 Value *CountIfTwo = Builder.CreateAdd(
5384 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5385 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5386 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5387 }
5388
5389 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5390 "omp_" + Name + ".tripcount");
5391}
5392
5395 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5396 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5397 ScanInfo *ScanRedInfo) {
5398 LocationDescription ComputeLoc =
5399 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5400
5402 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5403
5404 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5405 Builder.restoreIP(CodeGenIP);
5406 Value *Span = Builder.CreateMul(IV, Step);
5407 Value *IndVar = Builder.CreateAdd(Span, Start);
5408 if (InScan)
5409 ScanRedInfo->IV = IndVar;
5410 return BodyGenCB(Builder.saveIP(), IndVar);
5411 };
5412 LocationDescription LoopLoc =
5413 ComputeIP.isSet()
5414 ? Loc
5415 : LocationDescription(Builder.saveIP(),
5416 Builder.getCurrentDebugLocation());
5417 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5418}
5419
5420// Returns an LLVM function to call for initializing loop bounds using OpenMP
5421// static scheduling for composite `distribute parallel for` depending on
5422// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5423// integers as unsigned similarly to CanonicalLoopInfo.
5424static FunctionCallee
5426 OpenMPIRBuilder &OMPBuilder) {
5427 unsigned Bitwidth = Ty->getIntegerBitWidth();
5428 if (Bitwidth == 32)
5429 return OMPBuilder.getOrCreateRuntimeFunction(
5430 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5431 if (Bitwidth == 64)
5432 return OMPBuilder.getOrCreateRuntimeFunction(
5433 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5434 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5435}
5436
5437// Returns an LLVM function to call for initializing loop bounds using OpenMP
5438// static scheduling depending on `type`. Only i32 and i64 are supported by the
5439// runtime. Always interpret integers as unsigned similarly to
5440// CanonicalLoopInfo.
5442 OpenMPIRBuilder &OMPBuilder) {
5443 unsigned Bitwidth = Ty->getIntegerBitWidth();
5444 if (Bitwidth == 32)
5445 return OMPBuilder.getOrCreateRuntimeFunction(
5446 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5447 if (Bitwidth == 64)
5448 return OMPBuilder.getOrCreateRuntimeFunction(
5449 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5450 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5451}
5452
5453OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5454 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5455 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5456 OMPScheduleType DistScheduleSchedType) {
5457 assert(CLI->isValid() && "Requires a valid canonical loop");
5458 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5459 "Require dedicated allocate IP");
5460
5461 // Set up the source location value for OpenMP runtime.
5462 Builder.restoreIP(CLI->getPreheaderIP());
5463 Builder.SetCurrentDebugLocation(DL);
5464
5465 uint32_t SrcLocStrSize;
5466 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5467 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5468
5469 // Declare useful OpenMP runtime functions.
5470 Value *IV = CLI->getIndVar();
5471 Type *IVTy = IV->getType();
5472 FunctionCallee StaticInit =
5473 LoopType == WorksharingLoopType::DistributeForStaticLoop
5474 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5475 : getKmpcForStaticInitForType(IVTy, M, *this);
5476 FunctionCallee StaticFini =
5477 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5478
5479 // Allocate space for computed loop bounds as expected by the "init" function.
5480 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5481
5482 Type *I32Type = Type::getInt32Ty(M.getContext());
5483 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5484 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5485 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5486 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5487 CLI->setLastIter(PLastIter);
5488
5489 // At the end of the preheader, prepare for calling the "init" function by
5490 // storing the current loop bounds into the allocated space. A canonical loop
5491 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5492 // and produces an inclusive upper bound.
5493 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5494 Constant *Zero = ConstantInt::get(IVTy, 0);
5495 Constant *One = ConstantInt::get(IVTy, 1);
5496 Builder.CreateStore(Zero, PLowerBound);
5497 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5498 Builder.CreateStore(UpperBound, PUpperBound);
5499 Builder.CreateStore(One, PStride);
5500
5501 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5502
5503 OMPScheduleType SchedType =
5504 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5505 ? OMPScheduleType::OrderedDistribute
5507 Constant *SchedulingType =
5508 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5509
5510 // Call the "init" function and update the trip count of the loop with the
5511 // value it produced.
5512 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5513 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5514 this](Value *SchedulingType, auto &Builder) {
5515 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5516 PLowerBound, PUpperBound});
5517 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5518 Value *PDistUpperBound =
5519 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5520 Args.push_back(PDistUpperBound);
5521 }
5522 Args.append({PStride, One, Zero});
5523 createRuntimeFunctionCall(StaticInit, Args);
5524 };
5525 BuildInitCall(SchedulingType, Builder);
5526 if (HasDistSchedule &&
5527 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5528 Constant *DistScheduleSchedType = ConstantInt::get(
5529 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5530 // We want to emit a second init function call for the dist_schedule clause
5531 // to the Distribute construct. This should only be done however if a
5532 // Workshare Loop is nested within a Distribute Construct
5533 BuildInitCall(DistScheduleSchedType, Builder);
5534 }
5535 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5536 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5537 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5538 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5539 CLI->setTripCount(TripCount);
5540
5541 // Update all uses of the induction variable except the one in the condition
5542 // block that compares it with the actual upper bound, and the increment in
5543 // the latch block.
5544
5545 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5546 Builder.SetInsertPoint(CLI->getBody(),
5547 CLI->getBody()->getFirstInsertionPt());
5548 Builder.SetCurrentDebugLocation(DL);
5549 return Builder.CreateAdd(OldIV, LowerBound);
5550 });
5551
5552 // In the "exit" block, call the "fini" function.
5553 Builder.SetInsertPoint(CLI->getExit(),
5554 CLI->getExit()->getTerminator()->getIterator());
5555 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5556
5557 // Add the barrier if requested.
5558 if (NeedsBarrier) {
5559 InsertPointOrErrorTy BarrierIP =
5561 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5562 /* CheckCancelFlag */ false);
5563 if (!BarrierIP)
5564 return BarrierIP.takeError();
5565 }
5566
5567 InsertPointTy AfterIP = CLI->getAfterIP();
5568 CLI->invalidate();
5569
5570 return AfterIP;
5571}
5572
5573static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5574 LoopInfo &LI);
5575static void addLoopMetadata(CanonicalLoopInfo *Loop,
5576 ArrayRef<Metadata *> Properties);
5577
5579 LLVMContext &Ctx, Loop *Loop,
5581 SmallVector<Metadata *> &LoopMDList) {
5582 SmallSet<BasicBlock *, 8> Reachable;
5583
5584 // Get the basic blocks from the loop in which memref instructions
5585 // can be found.
5586 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5587 // preferably without running any passes.
5588 for (BasicBlock *Block : Loop->getBlocks()) {
5589 if (Block == CLI->getCond() || Block == CLI->getHeader())
5590 continue;
5591 Reachable.insert(Block);
5592 }
5593
5594 // Add access group metadata to memory-access instructions.
5595 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5596 for (BasicBlock *BB : Reachable)
5597 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5598 // TODO: If the loop has existing parallel access metadata, have
5599 // to combine two lists.
5600 LoopMDList.push_back(MDNode::get(
5601 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5602}
5603
5605OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5606 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5607 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5608 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5609 assert(CLI->isValid() && "Requires a valid canonical loop");
5610 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5611
5612 LLVMContext &Ctx = CLI->getFunction()->getContext();
5613 Value *IV = CLI->getIndVar();
5614 Value *OrigTripCount = CLI->getTripCount();
5615 Type *IVTy = IV->getType();
5616 assert(IVTy->getIntegerBitWidth() <= 64 &&
5617 "Max supported tripcount bitwidth is 64 bits");
5618 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5619 : Type::getInt64Ty(Ctx);
5620 Type *I32Type = Type::getInt32Ty(M.getContext());
5621 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5622 Constant *One = ConstantInt::get(InternalIVTy, 1);
5623
5624 Function *F = CLI->getFunction();
5625 // Blocks must have terminators.
5626 // FIXME: Don't run analyses on incomplete/invalid IR.
5628 for (BasicBlock &BB : *F)
5629 if (!BB.hasTerminator())
5630 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5632 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5633 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5634 LoopAnalysis LIA;
5635 LoopInfo &&LI = LIA.run(*F, FAM);
5636 for (Instruction *I : UIs)
5637 I->eraseFromParent();
5638 Loop *L = LI.getLoopFor(CLI->getHeader());
5639 SmallVector<Metadata *> LoopMDList;
5640 if (ChunkSize || DistScheduleChunkSize)
5641 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5642 addLoopMetadata(CLI, LoopMDList);
5643
5644 // Declare useful OpenMP runtime functions.
5645 FunctionCallee StaticInit =
5646 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5647 FunctionCallee StaticFini =
5648 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5649
5650 // Allocate space for computed loop bounds as expected by the "init" function.
5651 Builder.restoreIP(AllocaIP);
5652 Builder.SetCurrentDebugLocation(DL);
5653 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5654 Value *PLowerBound =
5655 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5656 Value *PUpperBound =
5657 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5658 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5659 CLI->setLastIter(PLastIter);
5660
5661 // Set up the source location value for the OpenMP runtime.
5662 Builder.restoreIP(CLI->getPreheaderIP());
5663 Builder.SetCurrentDebugLocation(DL);
5664
5665 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5666 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5667 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5668 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5669 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5670 "distschedulechunksize");
5671 Value *CastedTripCount =
5672 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5673
5674 Constant *SchedulingType =
5675 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5676 Constant *DistSchedulingType =
5677 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5678 Builder.CreateStore(Zero, PLowerBound);
5679 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5680 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5681 Value *UpperBound =
5682 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5683 Builder.CreateStore(UpperBound, PUpperBound);
5684 Builder.CreateStore(One, PStride);
5685
5686 // Call the "init" function and update the trip count of the loop with the
5687 // value it produced.
5688 uint32_t SrcLocStrSize;
5689 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5690 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5691 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5692 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5693 PUpperBound, PStride, One,
5694 this](Value *SchedulingType, Value *ChunkSize,
5695 auto &Builder) {
5697 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5698 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5699 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5700 /*pstride=*/PStride, /*incr=*/One,
5701 /*chunk=*/ChunkSize});
5702 };
5703 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5704 if (DistScheduleSchedType != OMPScheduleType::None &&
5705 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5706 SchedType != OMPScheduleType::OrderedDistribute) {
5707 // We want to emit a second init function call for the dist_schedule clause
5708 // to the Distribute construct. This should only be done however if a
5709 // Workshare Loop is nested within a Distribute Construct
5710 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5711 }
5712
5713 // Load values written by the "init" function.
5714 Value *FirstChunkStart =
5715 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5716 Value *FirstChunkStop =
5717 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5718 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5719 Value *ChunkRange =
5720 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5721 Value *NextChunkStride =
5722 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5723
5724 // Create outer "dispatch" loop for enumerating the chunks.
5725 BasicBlock *DispatchEnter = splitBB(Builder, true);
5726 Value *DispatchCounter;
5727
5728 // It is safe to assume this didn't return an error because the callback
5729 // passed into createCanonicalLoop is the only possible error source, and it
5730 // always returns success.
5731 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5732 {Builder.saveIP(), DL},
5733 [&](InsertPointTy BodyIP, Value *Counter) {
5734 DispatchCounter = Counter;
5735 return Error::success();
5736 },
5737 FirstChunkStart, CastedTripCount, NextChunkStride,
5738 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5739 "dispatch"));
5740
5741 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5742 // not have to preserve the canonical invariant.
5743 BasicBlock *DispatchBody = DispatchCLI->getBody();
5744 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5745 BasicBlock *DispatchExit = DispatchCLI->getExit();
5746 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5747 DispatchCLI->invalidate();
5748
5749 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5750 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5751 redirectTo(CLI->getExit(), DispatchLatch, DL);
5752 redirectTo(DispatchBody, DispatchEnter, DL);
5753
5754 // Prepare the prolog of the chunk loop.
5755 Builder.restoreIP(CLI->getPreheaderIP());
5756 Builder.SetCurrentDebugLocation(DL);
5757
5758 // Compute the number of iterations of the chunk loop.
5759 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5760 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5761 Value *IsLastChunk =
5762 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5763 Value *CountUntilOrigTripCount =
5764 Builder.CreateSub(CastedTripCount, DispatchCounter);
5765 Value *ChunkTripCount = Builder.CreateSelect(
5766 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5767 Value *BackcastedChunkTC =
5768 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5769 CLI->setTripCount(BackcastedChunkTC);
5770
5771 // Update all uses of the induction variable except the one in the condition
5772 // block that compares it with the actual upper bound, and the increment in
5773 // the latch block.
5774 Value *BackcastedDispatchCounter =
5775 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5776 CLI->mapIndVar([&](Instruction *) -> Value * {
5777 Builder.restoreIP(CLI->getBodyIP());
5778 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5779 });
5780
5781 // In the "exit" block, call the "fini" function.
5782 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5783 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5784
5785 // Add the barrier if requested.
5786 if (NeedsBarrier) {
5787 InsertPointOrErrorTy AfterIP =
5788 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5789 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5790 if (!AfterIP)
5791 return AfterIP.takeError();
5792 }
5793
5794#ifndef NDEBUG
5795 // Even though we currently do not support applying additional methods to it,
5796 // the chunk loop should remain a canonical loop.
5797 CLI->assertOK();
5798#endif
5799
5800 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5801}
5802
5803// Returns an LLVM function to call for executing an OpenMP static worksharing
5804// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5805// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5806static FunctionCallee
5808 WorksharingLoopType LoopType) {
5809 unsigned Bitwidth = Ty->getIntegerBitWidth();
5810 Module &M = OMPBuilder->M;
5811 switch (LoopType) {
5812 case WorksharingLoopType::ForStaticLoop:
5813 if (Bitwidth == 32)
5814 return OMPBuilder->getOrCreateRuntimeFunction(
5815 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5816 if (Bitwidth == 64)
5817 return OMPBuilder->getOrCreateRuntimeFunction(
5818 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5819 break;
5820 case WorksharingLoopType::DistributeStaticLoop:
5821 if (Bitwidth == 32)
5822 return OMPBuilder->getOrCreateRuntimeFunction(
5823 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5824 if (Bitwidth == 64)
5825 return OMPBuilder->getOrCreateRuntimeFunction(
5826 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5827 break;
5828 case WorksharingLoopType::DistributeForStaticLoop:
5829 if (Bitwidth == 32)
5830 return OMPBuilder->getOrCreateRuntimeFunction(
5831 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5832 if (Bitwidth == 64)
5833 return OMPBuilder->getOrCreateRuntimeFunction(
5834 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5835 break;
5836 }
5837 if (Bitwidth != 32 && Bitwidth != 64) {
5838 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5839 }
5840 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5841}
5842
5843// Inserts a call to proper OpenMP Device RTL function which handles
5844// loop worksharing.
5846 WorksharingLoopType LoopType,
5847 BasicBlock *InsertBlock, Value *Ident,
5848 Value *LoopBodyArg, Value *TripCount,
5849 Function &LoopBodyFn, bool NoLoop) {
5850 Type *TripCountTy = TripCount->getType();
5851 Module &M = OMPBuilder->M;
5852 IRBuilder<> &Builder = OMPBuilder->Builder;
5853 FunctionCallee RTLFn =
5854 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5855 SmallVector<Value *, 8> RealArgs;
5856 RealArgs.push_back(Ident);
5857 RealArgs.push_back(&LoopBodyFn);
5858 RealArgs.push_back(LoopBodyArg);
5859 RealArgs.push_back(TripCount);
5860 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5861 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5862 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5863 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5864 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5865 return;
5866 }
5867 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5868 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5869 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5870 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5871
5872 RealArgs.push_back(
5873 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5874 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5875 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5876 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5877 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5878 } else {
5879 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5880 }
5881
5882 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5883}
5884
5886 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5887 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5888 WorksharingLoopType LoopType, bool NoLoop) {
5889 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5890 BasicBlock *Preheader = CLI->getPreheader();
5891 Value *TripCount = CLI->getTripCount();
5892
5893 // After loop body outling, the loop body contains only set up
5894 // of loop body argument structure and the call to the outlined
5895 // loop body function. Firstly, we need to move setup of loop body args
5896 // into loop preheader.
5897 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5898 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5899
5900 // The next step is to remove the whole loop. We do not it need anymore.
5901 // That's why make an unconditional branch from loop preheader to loop
5902 // exit block
5903 Builder.restoreIP({Preheader, Preheader->end()});
5904 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5905 Preheader->getTerminator()->eraseFromParent();
5906 Builder.CreateBr(CLI->getExit());
5907
5908 // Delete dead loop blocks
5909 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5910 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5911 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5912 CleanUpInfo.EntryBB = CLI->getHeader();
5913 CleanUpInfo.ExitBB = CLI->getExit();
5914 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5915 DeleteDeadBlocks(BlocksToBeRemoved);
5916
5917 // Find the instruction which corresponds to loop body argument structure
5918 // and remove the call to loop body function instruction.
5919 Value *LoopBodyArg;
5920 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5921 assert(OutlinedFnUser &&
5922 "Expected unique undroppable user of outlined function");
5923 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5924 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5925 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5926 "Expected outlined function call to be located in loop preheader");
5927 // Check in case no argument structure has been passed.
5928 if (OutlinedFnCallInstruction->arg_size() > 1)
5929 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5930 else
5931 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5932 OutlinedFnCallInstruction->eraseFromParent();
5933
5934 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5935 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5936
5937 for (auto &ToBeDeletedItem : ToBeDeleted)
5938 ToBeDeletedItem->eraseFromParent();
5939 CLI->invalidate();
5940}
5941
5942OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5943 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5944 WorksharingLoopType LoopType, bool NoLoop) {
5945 uint32_t SrcLocStrSize;
5946 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5948
5949 OutlineInfo OI;
5950 OI.OuterAllocaBB = CLI->getPreheader();
5951 Function *OuterFn = CLI->getPreheader()->getParent();
5952
5953 // Instructions which need to be deleted at the end of code generation
5954 SmallVector<Instruction *, 4> ToBeDeleted;
5955
5956 OI.OuterAllocaBB = AllocaIP.getBlock();
5957
5958 // Mark the body loop as region which needs to be extracted
5959 OI.EntryBB = CLI->getBody();
5960 OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
5961 "omp.prelatch");
5962
5963 // Prepare loop body for extraction
5964 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5965
5966 // Insert new loop counter variable which will be used only in loop
5967 // body.
5968 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5969 Instruction *NewLoopCntLoad =
5970 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5971 // New loop counter instructions are redundant in the loop preheader when
5972 // code generation for workshare loop is finshed. That's why mark them as
5973 // ready for deletion.
5974 ToBeDeleted.push_back(NewLoopCntLoad);
5975 ToBeDeleted.push_back(NewLoopCnt);
5976
5977 // Analyse loop body region. Find all input variables which are used inside
5978 // loop body region.
5979 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5981 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5982
5983 CodeExtractorAnalysisCache CEAC(*OuterFn);
5984 CodeExtractor Extractor(Blocks,
5985 /* DominatorTree */ nullptr,
5986 /* AggregateArgs */ true,
5987 /* BlockFrequencyInfo */ nullptr,
5988 /* BranchProbabilityInfo */ nullptr,
5989 /* AssumptionCache */ nullptr,
5990 /* AllowVarArgs */ true,
5991 /* AllowAlloca */ true,
5992 /* AllocationBlock */ CLI->getPreheader(),
5993 /* Suffix */ ".omp_wsloop",
5994 /* AggrArgsIn0AddrSpace */ true);
5995
5996 BasicBlock *CommonExit = nullptr;
5997 SetVector<Value *> SinkingCands, HoistingCands;
5998
5999 // Find allocas outside the loop body region which are used inside loop
6000 // body
6001 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6002
6003 // We need to model loop body region as the function f(cnt, loop_arg).
6004 // That's why we replace loop induction variable by the new counter
6005 // which will be one of loop body function argument
6007 CLI->getIndVar()->user_end());
6008 for (auto Use : Users) {
6009 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6010 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6011 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6012 }
6013 }
6014 }
6015 // Make sure that loop counter variable is not merged into loop body
6016 // function argument structure and it is passed as separate variable
6017 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6018
6019 // PostOutline CB is invoked when loop body function is outlined and
6020 // loop body is replaced by call to outlined function. We need to add
6021 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6022 // function will handle loop control logic.
6023 //
6024 OI.PostOutlineCB = [=, ToBeDeletedVec =
6025 std::move(ToBeDeleted)](Function &OutlinedFn) {
6026 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6027 LoopType, NoLoop);
6028 };
6029 addOutlineInfo(std::move(OI));
6030 return CLI->getAfterIP();
6031}
6032
6035 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6036 bool HasSimdModifier, bool HasMonotonicModifier,
6037 bool HasNonmonotonicModifier, bool HasOrderedClause,
6038 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6039 Value *DistScheduleChunkSize) {
6040 if (Config.isTargetDevice())
6041 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6042 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6043 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6044 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6045
6046 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6047 OMPScheduleType::ModifierOrdered;
6048 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6049 if (HasDistSchedule) {
6050 DistScheduleSchedType = DistScheduleChunkSize
6051 ? OMPScheduleType::OrderedDistributeChunked
6052 : OMPScheduleType::OrderedDistribute;
6053 }
6054 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6055 case OMPScheduleType::BaseStatic:
6056 case OMPScheduleType::BaseDistribute:
6057 assert((!ChunkSize || !DistScheduleChunkSize) &&
6058 "No chunk size with static-chunked schedule");
6059 if (IsOrdered && !HasDistSchedule)
6060 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6061 NeedsBarrier, ChunkSize);
6062 // FIXME: Monotonicity ignored?
6063 if (DistScheduleChunkSize)
6064 return applyStaticChunkedWorkshareLoop(
6065 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6066 DistScheduleChunkSize, DistScheduleSchedType);
6067 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6068 HasDistSchedule);
6069
6070 case OMPScheduleType::BaseStaticChunked:
6071 case OMPScheduleType::BaseDistributeChunked:
6072 if (IsOrdered && !HasDistSchedule)
6073 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6074 NeedsBarrier, ChunkSize);
6075 // FIXME: Monotonicity ignored?
6076 return applyStaticChunkedWorkshareLoop(
6077 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6078 DistScheduleChunkSize, DistScheduleSchedType);
6079
6080 case OMPScheduleType::BaseRuntime:
6081 case OMPScheduleType::BaseAuto:
6082 case OMPScheduleType::BaseGreedy:
6083 case OMPScheduleType::BaseBalanced:
6084 case OMPScheduleType::BaseSteal:
6085 case OMPScheduleType::BaseRuntimeSimd:
6086 assert(!ChunkSize &&
6087 "schedule type does not support user-defined chunk sizes");
6088 [[fallthrough]];
6089 case OMPScheduleType::BaseGuidedSimd:
6090 case OMPScheduleType::BaseDynamicChunked:
6091 case OMPScheduleType::BaseGuidedChunked:
6092 case OMPScheduleType::BaseGuidedIterativeChunked:
6093 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6094 case OMPScheduleType::BaseStaticBalancedChunked:
6095 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6096 NeedsBarrier, ChunkSize);
6097
6098 default:
6099 llvm_unreachable("Unknown/unimplemented schedule kind");
6100 }
6101}
6102
6103/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6104/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6105/// the runtime. Always interpret integers as unsigned similarly to
6106/// CanonicalLoopInfo.
6107static FunctionCallee
6109 unsigned Bitwidth = Ty->getIntegerBitWidth();
6110 if (Bitwidth == 32)
6111 return OMPBuilder.getOrCreateRuntimeFunction(
6112 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6113 if (Bitwidth == 64)
6114 return OMPBuilder.getOrCreateRuntimeFunction(
6115 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6116 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6117}
6118
6119/// Returns an LLVM function to call for updating the next loop using OpenMP
6120/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6121/// the runtime. Always interpret integers as unsigned similarly to
6122/// CanonicalLoopInfo.
6123static FunctionCallee
6125 unsigned Bitwidth = Ty->getIntegerBitWidth();
6126 if (Bitwidth == 32)
6127 return OMPBuilder.getOrCreateRuntimeFunction(
6128 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6129 if (Bitwidth == 64)
6130 return OMPBuilder.getOrCreateRuntimeFunction(
6131 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6132 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6133}
6134
6135/// Returns an LLVM function to call for finalizing the dynamic loop using
6136/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6137/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6138static FunctionCallee
6140 unsigned Bitwidth = Ty->getIntegerBitWidth();
6141 if (Bitwidth == 32)
6142 return OMPBuilder.getOrCreateRuntimeFunction(
6143 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6144 if (Bitwidth == 64)
6145 return OMPBuilder.getOrCreateRuntimeFunction(
6146 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6147 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6148}
6149
6151OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6152 InsertPointTy AllocaIP,
6153 OMPScheduleType SchedType,
6154 bool NeedsBarrier, Value *Chunk) {
6155 assert(CLI->isValid() && "Requires a valid canonical loop");
6156 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6157 "Require dedicated allocate IP");
6159 "Require valid schedule type");
6160
6161 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6162 OMPScheduleType::ModifierOrdered;
6163
6164 // Set up the source location value for OpenMP runtime.
6165 Builder.SetCurrentDebugLocation(DL);
6166
6167 uint32_t SrcLocStrSize;
6168 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6169 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6170
6171 // Declare useful OpenMP runtime functions.
6172 Value *IV = CLI->getIndVar();
6173 Type *IVTy = IV->getType();
6174 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6175 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6176
6177 // Allocate space for computed loop bounds as expected by the "init" function.
6178 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6179 Type *I32Type = Type::getInt32Ty(M.getContext());
6180 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6181 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6182 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6183 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6184 CLI->setLastIter(PLastIter);
6185
6186 // At the end of the preheader, prepare for calling the "init" function by
6187 // storing the current loop bounds into the allocated space. A canonical loop
6188 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6189 // and produces an inclusive upper bound.
6190 BasicBlock *PreHeader = CLI->getPreheader();
6191 Builder.SetInsertPoint(PreHeader->getTerminator());
6192 Constant *One = ConstantInt::get(IVTy, 1);
6193 Builder.CreateStore(One, PLowerBound);
6194 Value *UpperBound = CLI->getTripCount();
6195 Builder.CreateStore(UpperBound, PUpperBound);
6196 Builder.CreateStore(One, PStride);
6197
6198 BasicBlock *Header = CLI->getHeader();
6199 BasicBlock *Exit = CLI->getExit();
6200 BasicBlock *Cond = CLI->getCond();
6201 BasicBlock *Latch = CLI->getLatch();
6202 InsertPointTy AfterIP = CLI->getAfterIP();
6203
6204 // The CLI will be "broken" in the code below, as the loop is no longer
6205 // a valid canonical loop.
6206
6207 if (!Chunk)
6208 Chunk = One;
6209
6210 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6211
6212 Constant *SchedulingType =
6213 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6214
6215 // Call the "init" function.
6216 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6217 /* LowerBound */ One, UpperBound,
6218 /* step */ One, Chunk});
6219
6220 // An outer loop around the existing one.
6221 BasicBlock *OuterCond = BasicBlock::Create(
6222 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6223 PreHeader->getParent());
6224 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6225 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6227 DynamicNext,
6228 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6229 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6230 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6231 Value *LowerBound =
6232 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6233 Builder.CreateCondBr(MoreWork, Header, Exit);
6234
6235 // Change PHI-node in loop header to use outer cond rather than preheader,
6236 // and set IV to the LowerBound.
6237 Instruction *Phi = &Header->front();
6238 auto *PI = cast<PHINode>(Phi);
6239 PI->setIncomingBlock(0, OuterCond);
6240 PI->setIncomingValue(0, LowerBound);
6241
6242 // Then set the pre-header to jump to the OuterCond
6243 Instruction *Term = PreHeader->getTerminator();
6244 auto *Br = cast<UncondBrInst>(Term);
6245 Br->setSuccessor(OuterCond);
6246
6247 // Modify the inner condition:
6248 // * Use the UpperBound returned from the DynamicNext call.
6249 // * jump to the loop outer loop when done with one of the inner loops.
6250 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6251 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6252 Instruction *Comp = &*Builder.GetInsertPoint();
6253 auto *CI = cast<CmpInst>(Comp);
6254 CI->setOperand(1, UpperBound);
6255 // Redirect the inner exit to branch to outer condition.
6256 Instruction *Branch = &Cond->back();
6257 auto *BI = cast<CondBrInst>(Branch);
6258 assert(BI->getSuccessor(1) == Exit);
6259 BI->setSuccessor(1, OuterCond);
6260
6261 // Call the "fini" function if "ordered" is present in wsloop directive.
6262 if (Ordered) {
6263 Builder.SetInsertPoint(&Latch->back());
6264 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6265 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6266 }
6267
6268 // Add the barrier if requested.
6269 if (NeedsBarrier) {
6270 Builder.SetInsertPoint(&Exit->back());
6271 InsertPointOrErrorTy BarrierIP =
6273 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6274 /* CheckCancelFlag */ false);
6275 if (!BarrierIP)
6276 return BarrierIP.takeError();
6277 }
6278
6279 CLI->invalidate();
6280 return AfterIP;
6281}
6282
6283/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6284/// after this \p OldTarget will be orphaned.
6286 BasicBlock *NewTarget, DebugLoc DL) {
6287 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6288 redirectTo(Pred, NewTarget, DL);
6289}
6290
6291/// Determine which blocks in \p BBs are reachable from outside and remove the
6292/// ones that are not reachable from the function.
6295 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6296 for (Use &U : BB->uses()) {
6297 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6298 if (!UseInst)
6299 continue;
6300 if (BBsToErase.count(UseInst->getParent()))
6301 continue;
6302 return true;
6303 }
6304 return false;
6305 };
6306
6307 while (BBsToErase.remove_if(HasRemainingUses)) {
6308 // Try again if anything was removed.
6309 }
6310
6311 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6312 DeleteDeadBlocks(BBVec);
6313}
6314
6315CanonicalLoopInfo *
6317 InsertPointTy ComputeIP) {
6318 assert(Loops.size() >= 1 && "At least one loop required");
6319 size_t NumLoops = Loops.size();
6320
6321 // Nothing to do if there is already just one loop.
6322 if (NumLoops == 1)
6323 return Loops.front();
6324
6325 CanonicalLoopInfo *Outermost = Loops.front();
6326 CanonicalLoopInfo *Innermost = Loops.back();
6327 BasicBlock *OrigPreheader = Outermost->getPreheader();
6328 BasicBlock *OrigAfter = Outermost->getAfter();
6329 Function *F = OrigPreheader->getParent();
6330
6331 // Loop control blocks that may become orphaned later.
6332 SmallVector<BasicBlock *, 12> OldControlBBs;
6333 OldControlBBs.reserve(6 * Loops.size());
6335 Loop->collectControlBlocks(OldControlBBs);
6336
6337 // Setup the IRBuilder for inserting the trip count computation.
6338 Builder.SetCurrentDebugLocation(DL);
6339 if (ComputeIP.isSet())
6340 Builder.restoreIP(ComputeIP);
6341 else
6342 Builder.restoreIP(Outermost->getPreheaderIP());
6343
6344 // Derive the collapsed' loop trip count.
6345 // TODO: Find common/largest indvar type.
6346 Value *CollapsedTripCount = nullptr;
6347 for (CanonicalLoopInfo *L : Loops) {
6348 assert(L->isValid() &&
6349 "All loops to collapse must be valid canonical loops");
6350 Value *OrigTripCount = L->getTripCount();
6351 if (!CollapsedTripCount) {
6352 CollapsedTripCount = OrigTripCount;
6353 continue;
6354 }
6355
6356 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6357 CollapsedTripCount =
6358 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6359 }
6360
6361 // Create the collapsed loop control flow.
6362 CanonicalLoopInfo *Result =
6363 createLoopSkeleton(DL, CollapsedTripCount, F,
6364 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6365
6366 // Build the collapsed loop body code.
6367 // Start with deriving the input loop induction variables from the collapsed
6368 // one, using a divmod scheme. To preserve the original loops' order, the
6369 // innermost loop use the least significant bits.
6370 Builder.restoreIP(Result->getBodyIP());
6371
6372 Value *Leftover = Result->getIndVar();
6373 SmallVector<Value *> NewIndVars;
6374 NewIndVars.resize(NumLoops);
6375 for (int i = NumLoops - 1; i >= 1; --i) {
6376 Value *OrigTripCount = Loops[i]->getTripCount();
6377
6378 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6379 NewIndVars[i] = NewIndVar;
6380
6381 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6382 }
6383 // Outermost loop gets all the remaining bits.
6384 NewIndVars[0] = Leftover;
6385
6386 // Construct the loop body control flow.
6387 // We progressively construct the branch structure following in direction of
6388 // the control flow, from the leading in-between code, the loop nest body, the
6389 // trailing in-between code, and rejoining the collapsed loop's latch.
6390 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6391 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6392 // its predecessors as sources.
6393 BasicBlock *ContinueBlock = Result->getBody();
6394 BasicBlock *ContinuePred = nullptr;
6395 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6396 BasicBlock *NextSrc) {
6397 if (ContinueBlock)
6398 redirectTo(ContinueBlock, Dest, DL);
6399 else
6400 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6401
6402 ContinueBlock = nullptr;
6403 ContinuePred = NextSrc;
6404 };
6405
6406 // The code before the nested loop of each level.
6407 // Because we are sinking it into the nest, it will be executed more often
6408 // that the original loop. More sophisticated schemes could keep track of what
6409 // the in-between code is and instantiate it only once per thread.
6410 for (size_t i = 0; i < NumLoops - 1; ++i)
6411 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6412
6413 // Connect the loop nest body.
6414 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6415
6416 // The code after the nested loop at each level.
6417 for (size_t i = NumLoops - 1; i > 0; --i)
6418 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6419
6420 // Connect the finished loop to the collapsed loop latch.
6421 ContinueWith(Result->getLatch(), nullptr);
6422
6423 // Replace the input loops with the new collapsed loop.
6424 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6425 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6426
6427 // Replace the input loop indvars with the derived ones.
6428 for (size_t i = 0; i < NumLoops; ++i)
6429 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6430
6431 // Remove unused parts of the input loops.
6432 removeUnusedBlocksFromParent(OldControlBBs);
6433
6434 for (CanonicalLoopInfo *L : Loops)
6435 L->invalidate();
6436
6437#ifndef NDEBUG
6438 Result->assertOK();
6439#endif
6440 return Result;
6441}
6442
6443std::vector<CanonicalLoopInfo *>
6445 ArrayRef<Value *> TileSizes) {
6446 assert(TileSizes.size() == Loops.size() &&
6447 "Must pass as many tile sizes as there are loops");
6448 int NumLoops = Loops.size();
6449 assert(NumLoops >= 1 && "At least one loop to tile required");
6450
6451 CanonicalLoopInfo *OutermostLoop = Loops.front();
6452 CanonicalLoopInfo *InnermostLoop = Loops.back();
6453 Function *F = OutermostLoop->getBody()->getParent();
6454 BasicBlock *InnerEnter = InnermostLoop->getBody();
6455 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6456
6457 // Loop control blocks that may become orphaned later.
6458 SmallVector<BasicBlock *, 12> OldControlBBs;
6459 OldControlBBs.reserve(6 * Loops.size());
6461 Loop->collectControlBlocks(OldControlBBs);
6462
6463 // Collect original trip counts and induction variable to be accessible by
6464 // index. Also, the structure of the original loops is not preserved during
6465 // the construction of the tiled loops, so do it before we scavenge the BBs of
6466 // any original CanonicalLoopInfo.
6467 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6468 for (CanonicalLoopInfo *L : Loops) {
6469 assert(L->isValid() && "All input loops must be valid canonical loops");
6470 OrigTripCounts.push_back(L->getTripCount());
6471 OrigIndVars.push_back(L->getIndVar());
6472 }
6473
6474 // Collect the code between loop headers. These may contain SSA definitions
6475 // that are used in the loop nest body. To be usable with in the innermost
6476 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6477 // these instructions may be executed more often than before the tiling.
6478 // TODO: It would be sufficient to only sink them into body of the
6479 // corresponding tile loop.
6481 for (int i = 0; i < NumLoops - 1; ++i) {
6482 CanonicalLoopInfo *Surrounding = Loops[i];
6483 CanonicalLoopInfo *Nested = Loops[i + 1];
6484
6485 BasicBlock *EnterBB = Surrounding->getBody();
6486 BasicBlock *ExitBB = Nested->getHeader();
6487 InbetweenCode.emplace_back(EnterBB, ExitBB);
6488 }
6489
6490 // Compute the trip counts of the floor loops.
6491 Builder.SetCurrentDebugLocation(DL);
6492 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6493 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6494 for (int i = 0; i < NumLoops; ++i) {
6495 Value *TileSize = TileSizes[i];
6496 Value *OrigTripCount = OrigTripCounts[i];
6497 Type *IVType = OrigTripCount->getType();
6498
6499 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6500 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6501
6502 // 0 if tripcount divides the tilesize, 1 otherwise.
6503 // 1 means we need an additional iteration for a partial tile.
6504 //
6505 // Unfortunately we cannot just use the roundup-formula
6506 // (tripcount + tilesize - 1)/tilesize
6507 // because the summation might overflow. We do not want introduce undefined
6508 // behavior when the untiled loop nest did not.
6509 Value *FloorTripOverflow =
6510 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6511
6512 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6513 Value *FloorTripCount =
6514 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6515 "omp_floor" + Twine(i) + ".tripcount", true);
6516
6517 // Remember some values for later use.
6518 FloorCompleteCount.push_back(FloorCompleteTripCount);
6519 FloorCount.push_back(FloorTripCount);
6520 FloorRems.push_back(FloorTripRem);
6521 }
6522
6523 // Generate the new loop nest, from the outermost to the innermost.
6524 std::vector<CanonicalLoopInfo *> Result;
6525 Result.reserve(NumLoops * 2);
6526
6527 // The basic block of the surrounding loop that enters the nest generated
6528 // loop.
6529 BasicBlock *Enter = OutermostLoop->getPreheader();
6530
6531 // The basic block of the surrounding loop where the inner code should
6532 // continue.
6533 BasicBlock *Continue = OutermostLoop->getAfter();
6534
6535 // Where the next loop basic block should be inserted.
6536 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6537
6538 auto EmbeddNewLoop =
6539 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6540 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6541 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6542 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6543 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6544 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6545
6546 // Setup the position where the next embedded loop connects to this loop.
6547 Enter = EmbeddedLoop->getBody();
6548 Continue = EmbeddedLoop->getLatch();
6549 OutroInsertBefore = EmbeddedLoop->getLatch();
6550 return EmbeddedLoop;
6551 };
6552
6553 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6554 const Twine &NameBase) {
6555 for (auto P : enumerate(TripCounts)) {
6556 CanonicalLoopInfo *EmbeddedLoop =
6557 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6558 Result.push_back(EmbeddedLoop);
6559 }
6560 };
6561
6562 EmbeddNewLoops(FloorCount, "floor");
6563
6564 // Within the innermost floor loop, emit the code that computes the tile
6565 // sizes.
6566 Builder.SetInsertPoint(Enter->getTerminator());
6567 SmallVector<Value *, 4> TileCounts;
6568 for (int i = 0; i < NumLoops; ++i) {
6569 CanonicalLoopInfo *FloorLoop = Result[i];
6570 Value *TileSize = TileSizes[i];
6571
6572 Value *FloorIsEpilogue =
6573 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6574 Value *TileTripCount =
6575 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6576
6577 TileCounts.push_back(TileTripCount);
6578 }
6579
6580 // Create the tile loops.
6581 EmbeddNewLoops(TileCounts, "tile");
6582
6583 // Insert the inbetween code into the body.
6584 BasicBlock *BodyEnter = Enter;
6585 BasicBlock *BodyEntered = nullptr;
6586 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6587 BasicBlock *EnterBB = P.first;
6588 BasicBlock *ExitBB = P.second;
6589
6590 if (BodyEnter)
6591 redirectTo(BodyEnter, EnterBB, DL);
6592 else
6593 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6594
6595 BodyEnter = nullptr;
6596 BodyEntered = ExitBB;
6597 }
6598
6599 // Append the original loop nest body into the generated loop nest body.
6600 if (BodyEnter)
6601 redirectTo(BodyEnter, InnerEnter, DL);
6602 else
6603 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6605
6606 // Replace the original induction variable with an induction variable computed
6607 // from the tile and floor induction variables.
6608 Builder.restoreIP(Result.back()->getBodyIP());
6609 for (int i = 0; i < NumLoops; ++i) {
6610 CanonicalLoopInfo *FloorLoop = Result[i];
6611 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6612 Value *OrigIndVar = OrigIndVars[i];
6613 Value *Size = TileSizes[i];
6614
6615 Value *Scale =
6616 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6617 Value *Shift =
6618 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6619 OrigIndVar->replaceAllUsesWith(Shift);
6620 }
6621
6622 // Remove unused parts of the original loops.
6623 removeUnusedBlocksFromParent(OldControlBBs);
6624
6625 for (CanonicalLoopInfo *L : Loops)
6626 L->invalidate();
6627
6628#ifndef NDEBUG
6629 for (CanonicalLoopInfo *GenL : Result)
6630 GenL->assertOK();
6631#endif
6632 return Result;
6633}
6634
6635/// Attach metadata \p Properties to the basic block described by \p BB. If the
6636/// basic block already has metadata, the basic block properties are appended.
6638 ArrayRef<Metadata *> Properties) {
6639 // Nothing to do if no property to attach.
6640 if (Properties.empty())
6641 return;
6642
6643 LLVMContext &Ctx = BB->getContext();
6644 SmallVector<Metadata *> NewProperties;
6645 NewProperties.push_back(nullptr);
6646
6647 // If the basic block already has metadata, prepend it to the new metadata.
6648 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6649 if (Existing)
6650 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6651
6652 append_range(NewProperties, Properties);
6653 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6654 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6655
6656 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6657}
6658
6659/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6660/// loop already has metadata, the loop properties are appended.
6662 ArrayRef<Metadata *> Properties) {
6663 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6664
6665 // Attach metadata to the loop's latch
6666 BasicBlock *Latch = Loop->getLatch();
6667 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6668 addBasicBlockMetadata(Latch, Properties);
6669}
6670
6671/// Attach llvm.access.group metadata to the memref instructions of \p Block
6673 LoopInfo &LI) {
6674 for (Instruction &I : *Block) {
6675 if (I.mayReadOrWriteMemory()) {
6676 // TODO: This instruction may already have access group from
6677 // other pragmas e.g. #pragma clang loop vectorize. Append
6678 // so that the existing metadata is not overwritten.
6679 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6680 }
6681 }
6682}
6683
6684CanonicalLoopInfo *
6686 CanonicalLoopInfo *firstLoop = Loops.front();
6687 CanonicalLoopInfo *lastLoop = Loops.back();
6688 Function *F = firstLoop->getPreheader()->getParent();
6689
6690 // Loop control blocks that will become orphaned later
6691 SmallVector<BasicBlock *> oldControlBBs;
6693 Loop->collectControlBlocks(oldControlBBs);
6694
6695 // Collect original trip counts
6696 SmallVector<Value *> origTripCounts;
6697 for (CanonicalLoopInfo *L : Loops) {
6698 assert(L->isValid() && "All input loops must be valid canonical loops");
6699 origTripCounts.push_back(L->getTripCount());
6700 }
6701
6702 Builder.SetCurrentDebugLocation(DL);
6703
6704 // Compute max trip count.
6705 // The fused loop will be from 0 to max(origTripCounts)
6706 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6707 F, firstLoop->getHeader());
6708 Builder.SetInsertPoint(TCBlock);
6709 Value *fusedTripCount = nullptr;
6710 for (CanonicalLoopInfo *L : Loops) {
6711 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6712 Value *origTripCount = L->getTripCount();
6713 if (!fusedTripCount) {
6714 fusedTripCount = origTripCount;
6715 continue;
6716 }
6717 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6718 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6719 ".omp.fuse.tc");
6720 }
6721
6722 // Generate new loop
6723 CanonicalLoopInfo *fused =
6724 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6725 lastLoop->getLatch(), "fused");
6726
6727 // Replace original loops with the fused loop
6728 // Preheader and After are not considered inside the CLI.
6729 // These are used to compute the individual TCs of the loops
6730 // so they have to be put before the resulting fused loop.
6731 // Moving them up for readability.
6732 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6733 Loops[i]->getPreheader()->moveBefore(TCBlock);
6734 Loops[i]->getAfter()->moveBefore(TCBlock);
6735 }
6736 lastLoop->getPreheader()->moveBefore(TCBlock);
6737
6738 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6739 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6740 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6741 }
6742 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6743 redirectTo(TCBlock, fused->getPreheader(), DL);
6744 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6745
6746 // Build the fused body
6747 // Create new Blocks with conditions that jump to the original loop bodies
6749 SmallVector<Value *> condValues;
6750 for (size_t i = 0; i < Loops.size(); ++i) {
6751 BasicBlock *condBlock = BasicBlock::Create(
6752 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6753 Builder.SetInsertPoint(condBlock);
6754 Value *condValue =
6755 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6756 condBBs.push_back(condBlock);
6757 condValues.push_back(condValue);
6758 }
6759 // Join the condition blocks with the bodies of the original loops
6760 redirectTo(fused->getBody(), condBBs[0], DL);
6761 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6762 Builder.SetInsertPoint(condBBs[i]);
6763 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6764 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6765 // Replace the IV with the fused IV
6766 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6767 }
6768 // Last body jumps to the created end body block
6769 Builder.SetInsertPoint(condBBs.back());
6770 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6771 fused->getLatch());
6772 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6773 // Replace the IV with the fused IV
6774 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6775
6776 // The loop latch must have only one predecessor. Currently it is branched to
6777 // from both the last condition block and the last loop body
6778 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6779 "omp.fused.pre_latch");
6780
6781 // Remove unused parts
6782 removeUnusedBlocksFromParent(oldControlBBs);
6783
6784 // Invalidate old CLIs
6785 for (CanonicalLoopInfo *L : Loops)
6786 L->invalidate();
6787
6788#ifndef NDEBUG
6789 fused->assertOK();
6790#endif
6791 return fused;
6792}
6793
6795 LLVMContext &Ctx = Builder.getContext();
6797 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6798 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6799}
6800
6802 LLVMContext &Ctx = Builder.getContext();
6804 Loop, {
6805 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6806 });
6807}
6808
6809void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6810 Value *IfCond, ValueToValueMapTy &VMap,
6811 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6812 const Twine &NamePrefix) {
6813 Function *F = CanonicalLoop->getFunction();
6814
6815 // We can't do
6816 // if (cond) {
6817 // simd_loop;
6818 // } else {
6819 // non_simd_loop;
6820 // }
6821 // because then the CanonicalLoopInfo would only point to one of the loops:
6822 // leading to other constructs operating on the same loop to malfunction.
6823 // Instead generate
6824 // while (...) {
6825 // if (cond) {
6826 // simd_body;
6827 // } else {
6828 // not_simd_body;
6829 // }
6830 // }
6831 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6832 // body at -O3
6833
6834 // Define where if branch should be inserted
6835 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6836
6837 // Create additional blocks for the if statement
6838 BasicBlock *Cond = SplitBeforeIt->getParent();
6839 llvm::LLVMContext &C = Cond->getContext();
6841 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6843 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6844
6845 // Create if condition branch.
6846 Builder.SetInsertPoint(SplitBeforeIt);
6847 Instruction *BrInstr =
6848 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6849 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6850 // Then block contains branch to omp loop body which needs to be vectorized
6851 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6852 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6853
6854 Builder.SetInsertPoint(ElseBlock);
6855
6856 // Clone loop for the else branch
6858
6859 SmallVector<BasicBlock *, 8> ExistingBlocks;
6860 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6861 ExistingBlocks.push_back(ThenBlock);
6862 ExistingBlocks.append(L->block_begin(), L->block_end());
6863 // Cond is the block that has the if clause condition
6864 // LoopCond is omp_loop.cond
6865 // LoopHeader is omp_loop.header
6866 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6867 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6868 assert(LoopCond && LoopHeader && "Invalid loop structure");
6869 for (BasicBlock *Block : ExistingBlocks) {
6870 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6871 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6872 continue;
6873 }
6874 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6875
6876 // fix name not to be omp.if.then
6877 if (Block == ThenBlock)
6878 NewBB->setName(NamePrefix + ".if.else");
6879
6880 NewBB->moveBefore(CanonicalLoop->getExit());
6881 VMap[Block] = NewBB;
6882 NewBlocks.push_back(NewBB);
6883 }
6884 remapInstructionsInBlocks(NewBlocks, VMap);
6885 Builder.CreateBr(NewBlocks.front());
6886
6887 // The loop latch must have only one predecessor. Currently it is branched to
6888 // from both the 'then' and 'else' branches.
6889 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6890 NamePrefix + ".pre_latch");
6891
6892 // Ensure that the then block is added to the loop so we add the attributes in
6893 // the next step
6894 L->addBasicBlockToLoop(ThenBlock, LI);
6895}
6896
6897unsigned
6899 const StringMap<bool> &Features) {
6900 if (TargetTriple.isX86()) {
6901 if (Features.lookup("avx512f"))
6902 return 512;
6903 else if (Features.lookup("avx"))
6904 return 256;
6905 return 128;
6906 }
6907 if (TargetTriple.isPPC())
6908 return 128;
6909 if (TargetTriple.isWasm())
6910 return 128;
6911 return 0;
6912}
6913
6915 MapVector<Value *, Value *> AlignedVars,
6916 Value *IfCond, OrderKind Order,
6917 ConstantInt *Simdlen, ConstantInt *Safelen) {
6918 LLVMContext &Ctx = Builder.getContext();
6919
6920 Function *F = CanonicalLoop->getFunction();
6921
6922 // Blocks must have terminators.
6923 // FIXME: Don't run analyses on incomplete/invalid IR.
6925 for (BasicBlock &BB : *F)
6926 if (!BB.hasTerminator())
6927 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
6928
6929 // TODO: We should not rely on pass manager. Currently we use pass manager
6930 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6931 // object. We should have a method which returns all blocks between
6932 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6934 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6935 FAM.registerPass([]() { return LoopAnalysis(); });
6936 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6937
6938 LoopAnalysis LIA;
6939 LoopInfo &&LI = LIA.run(*F, FAM);
6940
6941 for (Instruction *I : UIs)
6942 I->eraseFromParent();
6943
6944 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6945 if (AlignedVars.size()) {
6946 InsertPointTy IP = Builder.saveIP();
6947 for (auto &AlignedItem : AlignedVars) {
6948 Value *AlignedPtr = AlignedItem.first;
6949 Value *Alignment = AlignedItem.second;
6950 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6951 Builder.SetInsertPoint(loadInst->getNextNode());
6952 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6953 Alignment);
6954 }
6955 Builder.restoreIP(IP);
6956 }
6957
6958 if (IfCond) {
6959 ValueToValueMapTy VMap;
6960 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6961 }
6962
6964
6965 // Get the basic blocks from the loop in which memref instructions
6966 // can be found.
6967 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
6968 // preferably without running any passes.
6969 for (BasicBlock *Block : L->getBlocks()) {
6970 if (Block == CanonicalLoop->getCond() ||
6971 Block == CanonicalLoop->getHeader())
6972 continue;
6973 Reachable.insert(Block);
6974 }
6975
6976 SmallVector<Metadata *> LoopMDList;
6977
6978 // In presence of finite 'safelen', it may be unsafe to mark all
6979 // the memory instructions parallel, because loop-carried
6980 // dependences of 'safelen' iterations are possible.
6981 // If clause order(concurrent) is specified then the memory instructions
6982 // are marked parallel even if 'safelen' is finite.
6983 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6984 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6985
6986 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6987 // versions so we can't add the loop attributes in that case.
6988 if (IfCond) {
6989 // we can still add llvm.loop.parallel_access
6990 addLoopMetadata(CanonicalLoop, LoopMDList);
6991 return;
6992 }
6993
6994 // Use the above access group metadata to create loop level
6995 // metadata, which should be distinct for each loop.
6996 ConstantAsMetadata *BoolConst =
6998 LoopMDList.push_back(MDNode::get(
6999 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7000
7001 if (Simdlen || Safelen) {
7002 // If both simdlen and safelen clauses are specified, the value of the
7003 // simdlen parameter must be less than or equal to the value of the safelen
7004 // parameter. Therefore, use safelen only in the absence of simdlen.
7005 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7006 LoopMDList.push_back(
7007 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7008 ConstantAsMetadata::get(VectorizeWidth)}));
7009 }
7010
7011 addLoopMetadata(CanonicalLoop, LoopMDList);
7012}
7013
7014/// Create the TargetMachine object to query the backend for optimization
7015/// preferences.
7016///
7017/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7018/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7019/// needed for the LLVM pass pipline. We use some default options to avoid
7020/// having to pass too many settings from the frontend that probably do not
7021/// matter.
7022///
7023/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7024/// method. If we are going to use TargetMachine for more purposes, especially
7025/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7026/// might become be worth requiring front-ends to pass on their TargetMachine,
7027/// or at least cache it between methods. Note that while fontends such as Clang
7028/// have just a single main TargetMachine per translation unit, "target-cpu" and
7029/// "target-features" that determine the TargetMachine are per-function and can
7030/// be overrided using __attribute__((target("OPTIONS"))).
7031static std::unique_ptr<TargetMachine>
7033 Module *M = F->getParent();
7034
7035 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7036 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7037 const llvm::Triple &Triple = M->getTargetTriple();
7038
7039 std::string Error;
7041 if (!TheTarget)
7042 return {};
7043
7045 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7046 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7047 /*CodeModel=*/std::nullopt, OptLevel));
7048}
7049
7050/// Heuristically determine the best-performant unroll factor for \p CLI. This
7051/// depends on the target processor. We are re-using the same heuristics as the
7052/// LoopUnrollPass.
7054 Function *F = CLI->getFunction();
7055
7056 // Assume the user requests the most aggressive unrolling, even if the rest of
7057 // the code is optimized using a lower setting.
7059 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7060
7061 // Blocks must have terminators.
7062 // FIXME: Don't run analyses on incomplete/invalid IR.
7064 for (BasicBlock &BB : *F)
7065 if (!BB.hasTerminator())
7066 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7067
7069 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7070 FAM.registerPass([]() { return AssumptionAnalysis(); });
7071 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7072 FAM.registerPass([]() { return LoopAnalysis(); });
7073 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7074 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7075 TargetIRAnalysis TIRA;
7076 if (TM)
7077 TIRA = TargetIRAnalysis(
7078 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7079 FAM.registerPass([&]() { return TIRA; });
7080
7081 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7083 ScalarEvolution &&SE = SEA.run(*F, FAM);
7085 DominatorTree &&DT = DTA.run(*F, FAM);
7086 LoopAnalysis LIA;
7087 LoopInfo &&LI = LIA.run(*F, FAM);
7089 AssumptionCache &&AC = ACT.run(*F, FAM);
7091
7092 for (Instruction *I : UIs)
7093 I->eraseFromParent();
7094
7095 Loop *L = LI.getLoopFor(CLI->getHeader());
7096 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7097
7099 L, SE, TTI,
7100 /*BlockFrequencyInfo=*/nullptr,
7101 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7102 /*UserThreshold=*/std::nullopt,
7103 /*UserCount=*/std::nullopt,
7104 /*UserAllowPartial=*/true,
7105 /*UserAllowRuntime=*/true,
7106 /*UserUpperBound=*/std::nullopt,
7107 /*UserFullUnrollMaxCount=*/std::nullopt);
7108
7109 UP.Force = true;
7110
7111 // Account for additional optimizations taking place before the LoopUnrollPass
7112 // would unroll the loop.
7115
7116 // Use normal unroll factors even if the rest of the code is optimized for
7117 // size.
7120
7121 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7122 << " Threshold=" << UP.Threshold << "\n"
7123 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7124 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7125 << " PartialOptSizeThreshold="
7126 << UP.PartialOptSizeThreshold << "\n");
7127
7128 // Disable peeling.
7131 /*UserAllowPeeling=*/false,
7132 /*UserAllowProfileBasedPeeling=*/false,
7133 /*UnrollingSpecficValues=*/false);
7134
7136 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7137
7138 // Assume that reads and writes to stack variables can be eliminated by
7139 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7140 // size.
7141 for (BasicBlock *BB : L->blocks()) {
7142 for (Instruction &I : *BB) {
7143 Value *Ptr;
7144 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7145 Ptr = Load->getPointerOperand();
7146 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7147 Ptr = Store->getPointerOperand();
7148 } else
7149 continue;
7150
7151 Ptr = Ptr->stripPointerCasts();
7152
7153 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7154 if (Alloca->getParent() == &F->getEntryBlock())
7155 EphValues.insert(&I);
7156 }
7157 }
7158 }
7159
7160 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7161
7162 // Loop is not unrollable if the loop contains certain instructions.
7163 if (!UCE.canUnroll()) {
7164 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7165 return 1;
7166 }
7167
7168 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7169 << "\n");
7170
7171 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7172 // be able to use it.
7173 int TripCount = 0;
7174 int MaxTripCount = 0;
7175 bool MaxOrZero = false;
7176 unsigned TripMultiple = 0;
7177
7178 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7179 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7180 unsigned Factor = UP.Count;
7181 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7182
7183 // This function returns 1 to signal to not unroll a loop.
7184 if (Factor == 0)
7185 return 1;
7186 return Factor;
7187}
7188
7190 int32_t Factor,
7191 CanonicalLoopInfo **UnrolledCLI) {
7192 assert(Factor >= 0 && "Unroll factor must not be negative");
7193
7194 Function *F = Loop->getFunction();
7195 LLVMContext &Ctx = F->getContext();
7196
7197 // If the unrolled loop is not used for another loop-associated directive, it
7198 // is sufficient to add metadata for the LoopUnrollPass.
7199 if (!UnrolledCLI) {
7200 SmallVector<Metadata *, 2> LoopMetadata;
7201 LoopMetadata.push_back(
7202 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7203
7204 if (Factor >= 1) {
7206 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7207 LoopMetadata.push_back(MDNode::get(
7208 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7209 }
7210
7211 addLoopMetadata(Loop, LoopMetadata);
7212 return;
7213 }
7214
7215 // Heuristically determine the unroll factor.
7216 if (Factor == 0)
7218
7219 // No change required with unroll factor 1.
7220 if (Factor == 1) {
7221 *UnrolledCLI = Loop;
7222 return;
7223 }
7224
7225 assert(Factor >= 2 &&
7226 "unrolling only makes sense with a factor of 2 or larger");
7227
7228 Type *IndVarTy = Loop->getIndVarType();
7229
7230 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7231 // unroll the inner loop.
7232 Value *FactorVal =
7233 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7234 /*isSigned=*/false));
7235 std::vector<CanonicalLoopInfo *> LoopNest =
7236 tileLoops(DL, {Loop}, {FactorVal});
7237 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7238 *UnrolledCLI = LoopNest[0];
7239 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7240
7241 // LoopUnrollPass can only fully unroll loops with constant trip count.
7242 // Unroll by the unroll factor with a fallback epilog for the remainder
7243 // iterations if necessary.
7245 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7247 InnerLoop,
7248 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7250 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7251
7252#ifndef NDEBUG
7253 (*UnrolledCLI)->assertOK();
7254#endif
7255}
7256
7259 llvm::Value *BufSize, llvm::Value *CpyBuf,
7260 llvm::Value *CpyFn, llvm::Value *DidIt) {
7261 if (!updateToLocation(Loc))
7262 return Loc.IP;
7263
7264 uint32_t SrcLocStrSize;
7265 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7266 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7267 Value *ThreadId = getOrCreateThreadID(Ident);
7268
7269 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7270
7271 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7272
7273 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7274 createRuntimeFunctionCall(Fn, Args);
7275
7276 return Builder.saveIP();
7277}
7278
7280 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7281 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7283
7284 if (!updateToLocation(Loc))
7285 return Loc.IP;
7286
7287 // If needed allocate and initialize `DidIt` with 0.
7288 // DidIt: flag variable: 1=single thread; 0=not single thread.
7289 llvm::Value *DidIt = nullptr;
7290 if (!CPVars.empty()) {
7291 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7292 Builder.CreateStore(Builder.getInt32(0), DidIt);
7293 }
7294
7295 Directive OMPD = Directive::OMPD_single;
7296 uint32_t SrcLocStrSize;
7297 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7298 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7299 Value *ThreadId = getOrCreateThreadID(Ident);
7300 Value *Args[] = {Ident, ThreadId};
7301
7302 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7303 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7304
7305 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7306 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7307
7308 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7309 if (Error Err = FiniCB(IP))
7310 return Err;
7311
7312 // The thread that executes the single region must set `DidIt` to 1.
7313 // This is used by __kmpc_copyprivate, to know if the caller is the
7314 // single thread or not.
7315 if (DidIt)
7316 Builder.CreateStore(Builder.getInt32(1), DidIt);
7317
7318 return Error::success();
7319 };
7320
7321 // generates the following:
7322 // if (__kmpc_single()) {
7323 // .... single region ...
7324 // __kmpc_end_single
7325 // }
7326 // __kmpc_copyprivate
7327 // __kmpc_barrier
7328
7329 InsertPointOrErrorTy AfterIP =
7330 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7331 /*Conditional*/ true,
7332 /*hasFinalize*/ true);
7333 if (!AfterIP)
7334 return AfterIP.takeError();
7335
7336 if (DidIt) {
7337 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7338 // NOTE BufSize is currently unused, so just pass 0.
7340 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7341 CPFuncs[I], DidIt);
7342 // NOTE __kmpc_copyprivate already inserts a barrier
7343 } else if (!IsNowait) {
7344 InsertPointOrErrorTy AfterIP =
7346 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7347 /* CheckCancelFlag */ false);
7348 if (!AfterIP)
7349 return AfterIP.takeError();
7350 }
7351 return Builder.saveIP();
7352}
7353
7355 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7356 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7357
7358 if (!updateToLocation(Loc))
7359 return Loc.IP;
7360
7361 Directive OMPD = Directive::OMPD_critical;
7362 uint32_t SrcLocStrSize;
7363 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7364 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7365 Value *ThreadId = getOrCreateThreadID(Ident);
7366 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7367 Value *Args[] = {Ident, ThreadId, LockVar};
7368
7369 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7370 Function *RTFn = nullptr;
7371 if (HintInst) {
7372 // Add Hint to entry Args and create call
7373 EnterArgs.push_back(HintInst);
7374 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7375 } else {
7376 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7377 }
7378 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7379
7380 Function *ExitRTLFn =
7381 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7382 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7383
7384 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7385 /*Conditional*/ false, /*hasFinalize*/ true);
7386}
7387
7390 InsertPointTy AllocaIP, unsigned NumLoops,
7391 ArrayRef<llvm::Value *> StoreValues,
7392 const Twine &Name, bool IsDependSource) {
7393 assert(
7394 llvm::all_of(StoreValues,
7395 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7396 "OpenMP runtime requires depend vec with i64 type");
7397
7398 if (!updateToLocation(Loc))
7399 return Loc.IP;
7400
7401 // Allocate space for vector and generate alloc instruction.
7402 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7403 Builder.restoreIP(AllocaIP);
7404 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7405 ArgsBase->setAlignment(Align(8));
7407
7408 // Store the index value with offset in depend vector.
7409 for (unsigned I = 0; I < NumLoops; ++I) {
7410 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7411 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7412 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7413 STInst->setAlignment(Align(8));
7414 }
7415
7416 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7417 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7418
7419 uint32_t SrcLocStrSize;
7420 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7421 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7422 Value *ThreadId = getOrCreateThreadID(Ident);
7423 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7424
7425 Function *RTLFn = nullptr;
7426 if (IsDependSource)
7427 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7428 else
7429 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7430 createRuntimeFunctionCall(RTLFn, Args);
7431
7432 return Builder.saveIP();
7433}
7434
/// NOTE(review): the declaration line for this definition (source line 7435)
/// was dropped by this dump; from the parameters and OMPD_ordered below this
/// is presumably OpenMPIRBuilder's builder for the OpenMP `ordered`
/// (threads/simd) region -- confirm against the full source file.
7436    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7437    FinalizeCallbackTy FiniCB, bool IsThreads) {
  // Bail out (returning the caller's insertion point) if the location cannot
  // be materialized.
7438  if (!updateToLocation(Loc))
7439    return Loc.IP;
7440
7441  Directive OMPD = Directive::OMPD_ordered;
7442  Instruction *EntryCall = nullptr;
7443  Instruction *ExitCall = nullptr;
7444
  // Only the "threads" flavor brackets the region with runtime calls;
  // otherwise the region body is emitted with no entry/exit calls.
7445  if (IsThreads) {
7446    uint32_t SrcLocStrSize;
7447    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7448    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7449    Value *ThreadId = getOrCreateThreadID(Ident);
7450    Value *Args[] = {Ident, ThreadId};
7451
    // __kmpc_ordered(ident, tid) on entry ...
7452    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7453    EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7454
    // ... and __kmpc_end_ordered(ident, tid) on exit.
7455    Function *ExitRTLFn =
7456        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7457    ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7458  }
7459
  // Emit as a non-conditional inlined region with finalization support.
7460  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7461                              /*Conditional*/ false, /*hasFinalize*/ true);
7462}
7463
/// Emit an inlined (not outlined) region for directive \p OMPD: split the
/// current block into entry / finalize / exit blocks, emit the optional
/// EntryCall guard, run \p BodyGenCB, then emit finalization plus the
/// optional ExitCall, and merge the exit block back where possible.
/// NOTE(review): this dump dropped source lines 7476 and 7507 (numbering
/// gaps below) -- verify the exact control flow against the original file.
7464OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7465    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7466    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7467    bool HasFinalize, bool IsCancellable) {
7468
  // Push the finalization callback; it is popped again inside
  // emitCommonDirectiveExit when HasFinalize is set.
7469  if (HasFinalize)
7470    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7471
7472  // Create inlined region's entry and body blocks, in preparation
7473  // for conditional creation
7474  BasicBlock *EntryBB = Builder.GetInsertBlock();
7475  Instruction *SplitPos = EntryBB->getTerminatorOrNull();
  // NOTE(review): line 7476 is missing here (presumably a guard such as
  // "if (!SplitPos)" before the placeholder terminator below).
7477    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7478  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7479  BasicBlock *FiniBB =
7480      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7481
7482  Builder.SetInsertPoint(EntryBB->getTerminator());
7483  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7484
7485  // generate body
7486  if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7487                            /* CodeGenIP */ Builder.saveIP()))
7488    return Err;
7489
7490  // emit exit call and do any needed finalization.
7491  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7492  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7493         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7494         "Unexpected control flow graph state!!");
7495  InsertPointOrErrorTy AfterIP =
7496      emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7497  if (!AfterIP)
7498    return AfterIP.takeError();
7499
7500  // If we are skipping the region of a non conditional, remove the exit
7501  // block, and clear the builder's insertion point.
7502  assert(SplitPos->getParent() == ExitBB &&
7503         "Unexpected Insertion point location!");
7504  auto merged = MergeBlockIntoPredecessor(ExitBB);
7505  BasicBlock *ExitPredBB = SplitPos->getParent();
7506  auto InsertBB = merged ? ExitPredBB : ExitBB;
  // NOTE(review): line 7507 is missing here (likely a condition guarding
  // the erase of the placeholder terminator below).
7508    SplitPos->eraseFromParent();
7509  Builder.SetInsertPoint(InsertBB);
7510
7511  return Builder.saveIP();
7512}
7513
7514OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7515 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7516 // if nothing to do, Return current insertion point.
7517 if (!Conditional || !EntryCall)
7518 return Builder.saveIP();
7519
7520 BasicBlock *EntryBB = Builder.GetInsertBlock();
7521 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7522 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7523 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7524
7525 // Emit thenBB and set the Builder's insertion point there for
7526 // body generation next. Place the block after the current block.
7527 Function *CurFn = EntryBB->getParent();
7528 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7529
7530 // Move Entry branch to end of ThenBB, and replace with conditional
7531 // branch (If-stmt)
7532 Instruction *EntryBBTI = EntryBB->getTerminator();
7533 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7534 EntryBBTI->removeFromParent();
7535 Builder.SetInsertPoint(UI);
7536 Builder.Insert(EntryBBTI);
7537 UI->eraseFromParent();
7538 Builder.SetInsertPoint(ThenBB->getTerminator());
7539
7540 // return an insertion point to ExitBB.
7541 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7542}
7543
7544OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7545 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7546 bool HasFinalize) {
7547
7548 Builder.restoreIP(FinIP);
7549
7550 // If there is finalization to do, emit it before the exit call
7551 if (HasFinalize) {
7552 assert(!FinalizationStack.empty() &&
7553 "Unexpected finalization stack state!");
7554
7555 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7556 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7557
7558 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7559 return std::move(Err);
7560
7561 // Exit condition: insertion point is before the terminator of the new Fini
7562 // block
7563 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7564 }
7565
7566 if (!ExitCall)
7567 return Builder.saveIP();
7568
7569 // place the Exitcall as last instruction before Finalization block terminator
7570 ExitCall->removeFromParent();
7571 Builder.Insert(ExitCall);
7572
7573 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7574 ExitCall->getIterator());
7575}
7576
/// NOTE(review): the declaration line (source 7577) and lines 7583 and 7605
/// are missing from this dump; from the shape below this is presumably the
/// copyin-clause block builder (master vs. private address comparison) --
/// confirm against the full source.
7578    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7579    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  // No insertion point: nothing to emit.
7580  if (!IP.isSet())
7581    return IP;
7582
7584
  // Creates the following CFG structure:
  //   OMP_Entry : (MasterAddr != PrivateAddr)?
  //        F        T
  //        |         \
  //        |      copyin.not.master
  //        |         /
  //        v        /
  //   copyin.not.master.end
  //        |
  //        v
  //   OMP.Entry.Next
7596
7597  BasicBlock *OMP_Entry = IP.getBlock();
7598  Function *CurFn = OMP_Entry->getParent();
7599  BasicBlock *CopyBegin =
7600      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7601  BasicBlock *CopyEnd = nullptr;
7602
7603  // If entry block is terminated, split to preserve the branch to following
7604  // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
  // NOTE(review): line 7605 (the condition opening this branch, likely a
  // terminator check) is missing from this dump.
7606    CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7607                                         "copyin.not.master.end");
7608    OMP_Entry->getTerminator()->eraseFromParent();
7609  } else {
7610    CopyEnd =
7611        BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7612  }
7613
  // Compare the two addresses as integers; only non-master threads (whose
  // private copy differs from the master copy) take the copyin path.
7614  Builder.SetInsertPoint(OMP_Entry);
7615  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7616  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7617  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7618  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7619
7620  Builder.SetInsertPoint(CopyBegin);
7621  if (BranchtoEnd)
7622    Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7623
7624  return Builder.saveIP();
7625}
7626
/// NOTE(review): declaration line (source 7627) and lines 7630-7631 are
/// missing from this dump; this emits a __kmpc_alloc(tid, size, allocator)
/// runtime call -- presumably createOMPAlloc; confirm in the full source.
7628                                               Value *Size, Value *Allocator,
7629                                               std::string Name) {
7632
7633  uint32_t SrcLocStrSize;
7634  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7635  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7636  Value *ThreadId = getOrCreateThreadID(Ident);
  // Note: Ident is only used to derive the thread id; the runtime call
  // itself takes {tid, size, allocator}.
7637  Value *Args[] = {ThreadId, Size, Allocator};
7638
7639  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7640
7641  return createRuntimeFunctionCall(Fn, Args, Name);
7642}
7643
/// NOTE(review): declaration line (source 7644) and lines 7647-7648 are
/// missing from this dump; this emits a __kmpc_free(tid, addr, allocator)
/// runtime call -- presumably createOMPFree; confirm in the full source.
7645                                              Value *Addr, Value *Allocator,
7646                                              std::string Name) {
7649
7650  uint32_t SrcLocStrSize;
7651  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7652  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7653  Value *ThreadId = getOrCreateThreadID(Ident);
7654  Value *Args[] = {ThreadId, Addr, Allocator};
7655  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7656  return createRuntimeFunctionCall(Fn, Args, Name);
7657}
7658
/// NOTE(review): declaration line (source 7659) and lines 7663-7664 are
/// missing from this dump; this emits __tgt_interop_init -- presumably
/// createOMPInteropInit; confirm in the full source.
7660    const LocationDescription &Loc, Value *InteropVar,
7661    omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7662    Value *DependenceAddress, bool HaveNowaitClause) {
7665
7666  uint32_t SrcLocStrSize;
7667  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7668  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7669  Value *ThreadId = getOrCreateThreadID(Ident);
  // Default device is -1 (all-ones), matching the runtime's "default device"
  // convention.
7670  if (Device == nullptr)
7671    Device = Constant::getAllOnesValue(Int32);
7672  Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
  // With no dependences, pass a zero count and a null dependence vector.
7673  if (NumDependences == nullptr) {
7674    NumDependences = ConstantInt::get(Int32, 0);
7675    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7676    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7677  }
7678  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7679  Value *Args[] = {
7680      Ident, ThreadId, InteropVar, InteropTypeVal,
7681      Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7682
7683  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7684
7685  return createRuntimeFunctionCall(Fn, Args);
7686}
7687
/// NOTE(review): declaration line (source 7688) and lines 7691-7692 are
/// missing from this dump; this emits __tgt_interop_destroy -- presumably
/// createOMPInteropDestroy; confirm in the full source.
7689    const LocationDescription &Loc, Value *InteropVar, Value *Device,
7690    Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7693
7694  uint32_t SrcLocStrSize;
7695  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7696  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7697  Value *ThreadId = getOrCreateThreadID(Ident);
  // Default device is -1 (all-ones), the runtime's "default device" value.
7698  if (Device == nullptr)
7699    Device = Constant::getAllOnesValue(Int32);
  // With no dependences, pass a zero count and a null dependence vector.
7700  if (NumDependences == nullptr) {
7701    NumDependences = ConstantInt::get(Int32, 0);
7702    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7703    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7704  }
7705  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7706  Value *Args[] = {
7707      Ident, ThreadId, InteropVar, Device,
7708      NumDependences, DependenceAddress, HaveNowaitClauseVal};
7709
7710  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7711
7712  return createRuntimeFunctionCall(Fn, Args);
7713}
7714
/// NOTE(review): declaration line (source 7715) and lines 7720-7721 are
/// missing from this dump; this emits __tgt_interop_use -- presumably
/// createOMPInteropUse; confirm in the full source.
7716                                                  Value *InteropVar, Value *Device,
7717                                                  Value *NumDependences,
7718                                                  Value *DependenceAddress,
7719                                                  bool HaveNowaitClause) {
7722  uint32_t SrcLocStrSize;
7723  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7724  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7725  Value *ThreadId = getOrCreateThreadID(Ident);
  // Default device is -1 (all-ones), the runtime's "default device" value.
7726  if (Device == nullptr)
7727    Device = Constant::getAllOnesValue(Int32);
  // With no dependences, pass a zero count and a null dependence vector.
7728  if (NumDependences == nullptr) {
7729    NumDependences = ConstantInt::get(Int32, 0);
7730    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7731    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7732  }
7733  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7734  Value *Args[] = {
7735      Ident, ThreadId, InteropVar, Device,
7736      NumDependences, DependenceAddress, HaveNowaitClauseVal};
7737
7738  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7739
7740  return createRuntimeFunctionCall(Fn, Args);
7741}
7742
/// NOTE(review): the leading declaration lines (source 7743-7744) and lines
/// 7746-7747 are missing from this dump; this emits
/// __kmpc_threadprivate_cached -- presumably createCachedThreadPrivate;
/// confirm in the full source.
7745                                                   llvm::ConstantInt *Size, const llvm::Twine &Name) {
7748
7749  uint32_t SrcLocStrSize;
7750  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7751  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7752  Value *ThreadId = getOrCreateThreadID(Ident);
  // The cache itself is an internal module-level variable keyed by Name.
7753  Constant *ThreadPrivateCache =
7754      getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7755  llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7756
7757  Function *Fn =
7758      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7759
7760  return createRuntimeFunctionCall(Fn, Args);
7761}
7762
/// NOTE(review): the declaration line (source 7763), the Attrs parameter
/// line (7765), and line 7823 are missing from this dump; this builds the
/// kernel/dynamic environment globals and emits __kmpc_target_init --
/// presumably createTargetInit; confirm against the full source.
7764                                           const LocationDescription &Loc,
7766  assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7767         "expected num_threads and num_teams to be specified");
7768
7769  if (!updateToLocation(Loc))
7770    return Loc.IP;
7771
  // Constants feeding the ConfigurationEnvironment struct below.
7772  uint32_t SrcLocStrSize;
7773  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7774  Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7775  Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
  // The generic state machine is only needed when not running in SPMD mode.
7776  Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7777      Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7778  Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7779  Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7780
7781  Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7782  Function *Kernel = DebugKernelWrapper;
7783
7784  // We need to strip the debug prefix to get the correct kernel name.
7785  StringRef KernelName = Kernel->getName();
7786  const std::string DebugPrefix = "_debug__";
7787  if (KernelName.ends_with(DebugPrefix)) {
7788    KernelName = KernelName.drop_back(DebugPrefix.length());
7789    Kernel = M.getFunction(KernelName);
7790    assert(Kernel && "Expected the real kernel to exist");
7791  }
7792
7793  // Manifest the launch configuration in the metadata matching the kernel
7794  // environment.
7795  if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7796    writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7797
7798  // If MaxThreads not set, select the maximum between the default workgroup
7799  // size and the MinThreads value.
7800  int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7801  if (MaxThreadsVal < 0) {
7802    if (hasGridValue(T)) {
7803      MaxThreadsVal =
7804          std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
7805                   Attrs.MinThreads);
7806    } else {
7807      MaxThreadsVal = Attrs.MinThreads;
7808    }
7809  }
7810
7811  if (MaxThreadsVal > 0)
7812    writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7813
7814  Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7815  Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7816  Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7817  Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7818  Constant *ReductionDataSize =
7819      ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7820  Constant *ReductionBufferLength =
7821      ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7822
  // NOTE(review): line 7823 (presumably "Function *Fn =
  // getOrCreateRuntimeFunctionPtr(") is missing from this dump.
7824      omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7825  const DataLayout &DL = Fn->getDataLayout();
7826
  // Per-kernel dynamic environment global (mutable, protected, weak ODR).
  // NOTE(review): `DynamicEnvironment`/`KernelEnvironment` are used here as
  // struct types before the identically-named Constant* locals are declared
  // below; the lines introducing those type values appear to have been lost
  // in this dump -- verify against the full source.
7827  Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7828  Constant *DynamicEnvironmentInitializer =
7829      ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7830  GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7831      M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7832      DynamicEnvironmentInitializer, DynamicEnvironmentName,
7833      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7834      DL.getDefaultGlobalsAddressSpace());
7835  DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7836
  // Address-space-cast the global if its type differs from the expected
  // pointer type.
7837  Constant *DynamicEnvironment =
7838      DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7839          ? DynamicEnvironmentGV
7840          : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7841                                           DynamicEnvironmentPtr);
7842
7843  Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7844      ConfigurationEnvironment, {
7845                                    UseGenericStateMachineVal,
7846                                    MayUseNestedParallelismVal,
7847                                    IsSPMDVal,
7848                                    MinThreads,
7849                                    MaxThreads,
7850                                    MinTeams,
7851                                    MaxTeams,
7852                                    ReductionDataSize,
7853                                    ReductionBufferLength,
7854                                });
7855  Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7856      KernelEnvironment, {
7857                             ConfigurationEnvironmentInitializer,
7858                             Ident,
7859                             DynamicEnvironment,
7860                         });
7861  std::string KernelEnvironmentName =
7862      (KernelName + "_kernel_environment").str();
7863  GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7864      M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7865      KernelEnvironmentInitializer, KernelEnvironmentName,
7866      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7867      DL.getDefaultGlobalsAddressSpace());
7868  KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7869
7870  Constant *KernelEnvironment =
7871      KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7872          ? KernelEnvironmentGV
7873          : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7874                                           KernelEnvironmentPtr);
  // The launch environment is the last argument of the (wrapper) kernel.
7875  Value *KernelLaunchEnvironment =
7876      DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
7877  Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7878  KernelLaunchEnvironment =
7879      KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7880          ? KernelLaunchEnvironment
7881          : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7882                                        KernelLaunchEnvParamTy);
7883  CallInst *ThreadKind = createRuntimeFunctionCall(
7884      Fn, {KernelEnvironment, KernelLaunchEnvironment});
7885
7886  Value *ExecUserCode = Builder.CreateICmpEQ(
7887      ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7888      "exec_user_code");
7889
7890  // ThreadKind = __kmpc_target_init(...)
7891  // if (ThreadKind == -1)
7892  //   user_code
7893  // else
7894  //   return;
7895
7896  auto *UI = Builder.CreateUnreachable();
7897  BasicBlock *CheckBB = UI->getParent();
7898  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7899
7900  BasicBlock *WorkerExitBB = BasicBlock::Create(
7901      CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7902  Builder.SetInsertPoint(WorkerExitBB);
7903  Builder.CreateRetVoid();
7904
7905  auto *CheckBBTI = CheckBB->getTerminator();
7906  Builder.SetInsertPoint(CheckBBTI);
7907  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7908
7909  CheckBBTI->eraseFromParent();
7910  UI->eraseFromParent();
7911
7912  // Continue in the "user_code" block, see diagram above and in
7913  // openmp/libomptarget/deviceRTLs/common/include/target.h .
7914  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7915}
7916
/// NOTE(review): the declaration line (source 7917) plus lines 7923 and 7926
/// are missing from this dump; this emits __kmpc_target_deinit and patches
/// the kernel-environment global's reduction fields -- presumably
/// createTargetDeinit; confirm against the full source.
7918                                         int32_t TeamsReductionDataSize,
7919                                         int32_t TeamsReductionBufferLength) {
7920  if (!updateToLocation(Loc))
7921    return;
7922
  // NOTE(review): line 7923 (presumably the getOrCreateRuntimeFunctionPtr(
  // opening) and line 7926 (presumably the call emission) are missing here.
7924      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7925
7927
  // Nothing to patch if there is no teams reduction configuration.
7928  if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7929    return;
7930
7931  Function *Kernel = Builder.GetInsertBlock()->getParent();
7932  // We need to strip the debug prefix to get the correct kernel name.
7933  StringRef KernelName = Kernel->getName();
7934  const std::string DebugPrefix = "_debug__";
7935  if (KernelName.ends_with(DebugPrefix))
7936    KernelName = KernelName.drop_back(DebugPrefix.length());
7937  auto *KernelEnvironmentGV =
7938      M.getNamedGlobal((KernelName + "_kernel_environment").str());
7939  assert(KernelEnvironmentGV && "Expected kernel environment global\n");
  // Rewrite fields {0,7} and {0,8} (reduction data size / buffer length) of
  // the kernel environment's configuration sub-struct via constant folding.
7940  auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7941  auto *NewInitializer = ConstantFoldInsertValueInstruction(
7942      KernelEnvironmentInitializer,
7943      ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7944  NewInitializer = ConstantFoldInsertValueInstruction(
7945      NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7946      {0, 8});
7947  KernelEnvironmentGV->setInitializer(NewInitializer);
7948}
7949
7950static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7951 bool Min) {
7952 if (Kernel.hasFnAttribute(Name)) {
7953 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7954 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7955 }
7956 Kernel.addFnAttr(Name, llvm::utostr(Value));
7957}
7958
/// NOTE(review): the line naming this function (source 7960) is missing from
/// this dump; from the body this reads the {lower, upper} thread bounds for
/// a kernel from target-specific attributes -- presumably
/// readThreadBoundsForKernel; confirm against the full source.
7959std::pair<int32_t, int32_t>
7961  int32_t ThreadLimit =
7962      Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7963
  // AMDGPU encodes both bounds as "LB,UB" in amdgpu-flat-work-group-size.
7964  if (T.isAMDGPU()) {
7965    const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7966    if (!Attr.isValid() || !Attr.isStringAttribute())
7967      return {0, ThreadLimit};
7968    auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7969    int32_t LB, UB;
    // Unparsable upper bound: fall back to the OpenMP thread limit alone.
7970    if (!llvm::to_integer(UBStr, UB, 10))
7971      return {0, ThreadLimit};
    // A non-zero thread limit further clamps the attribute's upper bound.
7972    UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7973    if (!llvm::to_integer(LBStr, LB, 10))
7974      return {0, UB};
7975    return {LB, UB};
7976  }
7977
  // NVPTX only records an upper bound (max threads per block).
7978  if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
7979    int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
7980    return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7981  }
7982  return {0, ThreadLimit};
7983}
7984
/// NOTE(review): the leading declaration line (source 7985) and line 7996
/// are missing from this dump; this records the kernel's thread bounds in
/// target-specific attributes -- presumably writeThreadBoundsForKernel;
/// confirm against the full source.
7986                                             Function &Kernel, int32_t LB,
7987                                             int32_t UB) {
7988  Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7989
  // AMDGPU encodes both bounds as "LB,UB" in amdgpu-flat-work-group-size.
7990  if (T.isAMDGPU()) {
7991    Kernel.addFnAttr("amdgpu-flat-work-group-size",
7992                     llvm::utostr(LB) + "," + llvm::utostr(UB));
7993    return;
7994  }
7995
  // NOTE(review): line 7996 is missing here (presumably the NVPTX
  // updateNVPTXAttr call for the upper bound).
7997}
7998
/// NOTE(review): the line naming this function (source 8000) is missing from
/// this dump; returns {0, omp_target_num_teams} -- presumably
/// readTeamBoundsForKernel; confirm against the full source.
7999std::pair<int32_t, int32_t>
8001  // TODO: Read from backend annotations if available.
8002  return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8003}
8004
/// NOTE(review): the leading declaration line (source 8005) and line 8009
/// are missing from this dump; this records the team bounds in
/// target-specific attributes -- presumably writeTeamsForKernel; confirm
/// against the full source.
8006                                        int32_t LB, int32_t UB) {
8007  if (T.isNVPTX())
8008    if (UB > 0)
  // NOTE(review): line 8009 is missing here (presumably the NVPTX
  // updateNVPTXAttr call guarded by the two conditions above).
8010  if (T.isAMDGPU())
8011    Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
8012
8013  Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8014}
8015
/// Apply device-side attributes to an outlined target-region function.
/// NOTE(review): this dump dropped lines 8019, 8022, 8024, 8026 and 8028
/// (the bodies of the branches below, presumably linkage/visibility and
/// per-target calling-convention assignments) -- verify against the full
/// source before relying on this listing.
8016void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8017    Function *OutlinedFn) {
8018  if (Config.isTargetDevice()) {
8020    // TODO: Determine if DSO local can be set to true.
8021    OutlinedFn->setDSOLocal(false);
8023    if (T.isAMDGCN())
8025    else if (T.isNVPTX())
8027    else if (T.isSPIRV())
8029  }
8030}
8031
8032Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8033 StringRef EntryFnIDName) {
8034 if (Config.isTargetDevice()) {
8035 assert(OutlinedFn && "The outlined function must exist if embedded");
8036 return OutlinedFn;
8037 }
8038
8039 return new GlobalVariable(
8040 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8041 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8042}
8043
8044Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8045 StringRef EntryFnName) {
8046 if (OutlinedFn)
8047 return OutlinedFn;
8048
8049 assert(!M.getGlobalVariable(EntryFnName, true) &&
8050 "Named kernel already exists?");
8051 return new GlobalVariable(
8052 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8053 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8054}
8055
/// NOTE(review): the declaration line (source 8056) is missing from this
/// dump; this generates (via callback) and optionally registers an outlined
/// target-region function -- presumably emitTargetRegionFunction; confirm
/// against the full source.
8057    TargetRegionEntryInfo &EntryInfo,
8058    FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8059    Function *&OutlinedFn, Constant *&OutlinedFnID) {
8060
8061  SmallString<64> EntryFnName;
8062  OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8063
  // Generate the function body unless we are on the host with mandatory
  // offload (where only the registration stub is needed).
8064  if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8065    Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8066    if (!CBResult)
8067      return CBResult.takeError();
8068    OutlinedFn = *CBResult;
8069  } else {
8070    OutlinedFn = nullptr;
8071  }
8072
8073  // If this target outline function is not an offload entry, we don't need to
8074  // register it. This may be in the case of a false if clause, or if there are
8075  // no OpenMP targets.
8076  if (!IsOffloadEntry)
8077    return Error::success();
8078
8079  std::string EntryFnIDName =
8080      Config.isTargetDevice()
8081          ? std::string(EntryFnName)
8082          : createPlatformSpecificName({EntryFnName, "region_id"});
8083
8084  OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8085                                              EntryFnName, EntryFnIDName);
8086  return Error::success();
8087}
8088
/// NOTE(review): the declaration line (source 8089) and line 8098 (likely
/// the entry-kind argument of registerTargetRegionEntryInfo) are missing
/// from this dump; presumably registerTargetRegionFunction -- confirm
/// against the full source.
8090    TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8091    StringRef EntryFnName, StringRef EntryFnIDName) {
8092  if (OutlinedFn)
8093    setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
  // ID and entry address may be the function itself or synthesized globals,
  // depending on host/device configuration.
8094  auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8095  auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8096  OffloadInfoManager.registerTargetRegionEntryInfo(
8097      EntryInfo, EntryAddr, OutlinedFnID,
8099  return OutlinedFnID;
8100}
8101
/// NOTE(review): this dump dropped the declaration line (source 8102) and
/// several interior lines (8107, 8130, 8153, 8156-8158, 8161 -- numbering
/// gaps below); from the mapper runtime calls this is presumably
/// createTargetData. Verify the exact code against the full source.
8103    const LocationDescription &Loc, InsertPointTy AllocaIP,
8104    InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8105    TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8106    CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
8108        BodyGenTy BodyGenType)>
8109        BodyGenCB,
8110    function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8111  if (!updateToLocation(Loc))
8112    return InsertPointTy();
8113
8114  Builder.restoreIP(CodeGenIP);
8115
  // With no body callback this is a standalone enter/exit data directive.
8116  bool IsStandAlone = !BodyGenCB;
8117  MapInfosTy *MapInfo;
8118  // Generate the code for the opening of the data environment. Capture all the
8119  // arguments of the runtime call by reference because they are used in the
8120  // closing of the region.
8121  auto BeginThenGen = [&](InsertPointTy AllocaIP,
8122                          InsertPointTy CodeGenIP) -> Error {
8123    MapInfo = &GenMapInfoCB(Builder.saveIP());
8124    if (Error Err = emitOffloadingArrays(
8125            AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8126            /*IsNonContiguous=*/true, DeviceAddrCB))
8127      return Err;
8128
8129    TargetDataRTArgs RTArgs;
    // NOTE(review): line 8130 is missing here (presumably the
    // emitOffloadingArraysArgument call populating RTArgs).
8131
8132    // Emit the number of elements in the offloading arrays.
8133    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8134
8135    // Source location for the ident struct
8136    if (!SrcLocInfo) {
8137      uint32_t SrcLocStrSize;
8138      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8139      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8140    }
8141
8142    SmallVector<llvm::Value *, 13> OffloadingArgs = {
8143        SrcLocInfo, DeviceID,
8144        PointerNum, RTArgs.BasePointersArray,
8145        RTArgs.PointersArray, RTArgs.SizesArray,
8146        RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8147        RTArgs.MappersArray};
8148
8149    if (IsStandAlone) {
8150      assert(MapperFunc && "MapperFunc missing for standalone target data");
8151
      // NOTE(review): lines 8153, 8156-8158 and 8161 are missing below
      // (remaining lambda parameters, the nowait argument list tail, and the
      // runtime-call line) -- confirm against the full source.
8152      auto TaskBodyCB = [&](Value *, Value *,
8154        if (Info.HasNoWait) {
8155          OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8159        }
8160
8162            OffloadingArgs);
8163
8164        if (Info.HasNoWait) {
8165          BasicBlock *OffloadContBlock =
8166              BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8167          Function *CurFn = Builder.GetInsertBlock()->getParent();
8168          emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8169          Builder.restoreIP(Builder.saveIP());
8170        }
8171        return Error::success();
8172      };
8173
      // nowait requires wrapping the runtime call in an outer target task.
8174      bool RequiresOuterTargetTask = Info.HasNoWait;
8175      if (!RequiresOuterTargetTask)
8176        cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8177                            /*TargetTaskAllocaIP=*/{}));
8178      else
8179        cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8180                                /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8181    } else {
8182      Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8183          omp::OMPRTL___tgt_target_data_begin_mapper);
8184
8185      createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8186
      // Propagate updated device pointers into their privatized allocas.
8187      for (auto DeviceMap : Info.DevicePtrInfoMap) {
8188        if (isa<AllocaInst>(DeviceMap.second.second)) {
8189          auto *LI =
8190              Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8191          Builder.CreateStore(LI, DeviceMap.second.second);
8192        }
8193      }
8194
8195      // If device pointer privatization is required, emit the body of the
8196      // region here. It will have to be duplicated: with and without
8197      // privatization.
8198      InsertPointOrErrorTy AfterIP =
8199          BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8200      if (!AfterIP)
8201        return AfterIP.takeError();
8202      Builder.restoreIP(*AfterIP);
8203    }
8204    return Error::success();
8205  };
8206
8207  // If we need device pointer privatization, we need to emit the body of the
8208  // region with no privatization in the 'else' branch of the conditional.
8209  // Otherwise, we don't have to do anything.
8210  auto BeginElseGen = [&](InsertPointTy AllocaIP,
8211                          InsertPointTy CodeGenIP) -> Error {
8212    InsertPointOrErrorTy AfterIP =
8213        BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8214    if (!AfterIP)
8215      return AfterIP.takeError();
8216    Builder.restoreIP(*AfterIP);
8217    return Error::success();
8218  };
8219
8220  // Generate code for the closing of the data region.
8221  auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8222    TargetDataRTArgs RTArgs;
8223    Info.EmitDebug = !MapInfo->Names.empty();
8224    emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8225
8226    // Emit the number of elements in the offloading arrays.
8227    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8228
8229    // Source location for the ident struct
8230    if (!SrcLocInfo) {
8231      uint32_t SrcLocStrSize;
8232      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8233      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8234    }
8235
8236    Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8237                               PointerNum, RTArgs.BasePointersArray,
8238                               RTArgs.PointersArray, RTArgs.SizesArray,
8239                               RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8240                               RTArgs.MappersArray};
8241    Function *EndMapperFunc =
8242        getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8243
8244    createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8245    return Error::success();
8246  };
8247
8248  // We don't have to do anything to close the region if the if clause evaluates
8249  // to false.
8250  auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8251    return Error::success();
8252  };
8253
  // Drive begin/body/end emission, honoring an optional `if` clause.
8254  Error Err = [&]() -> Error {
8255    if (BodyGenCB) {
8256      Error Err = [&]() {
8257        if (IfCond)
8258          return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8259        return BeginThenGen(AllocaIP, Builder.saveIP());
8260      }();
8261
8262      if (Err)
8263        return Err;
8264
8265      // If we don't require privatization of device pointers, we emit the body
8266      // in between the runtime calls. This avoids duplicating the body code.
8267      InsertPointOrErrorTy AfterIP =
8268          BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8269      if (!AfterIP)
8270        return AfterIP.takeError();
8271      restoreIPandDebugLoc(Builder, *AfterIP);
8272
8273      if (IfCond)
8274        return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8275      return EndThenGen(AllocaIP, Builder.saveIP());
8276    }
8277    if (IfCond)
8278      return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8279    return BeginThenGen(AllocaIP, Builder.saveIP());
8280  }();
8281
8282  if (Err)
8283    return Err;
8284
8285  return Builder.saveIP();
8286}
8287
/// NOTE(review): the leading declaration lines (source 8288-8289) are
/// missing from this dump; from the body this selects the
/// __kmpc_[distribute_]for_static_init_{4,4u,8,8u} runtime function for the
/// given IV size/signedness -- presumably createForStaticInitFunction;
/// confirm against the full source.
8290                                            bool IsGPUDistribute) {
8291  assert((IVSize == 32 || IVSize == 64) &&
8292         "IV size is not compatible with the omp runtime");
8293  RuntimeFunction Name;
  // Distribute constructs on GPUs use the distribute_static_init family;
  // everything else uses for_static_init.
8294  if (IsGPUDistribute)
8295    Name = IVSize == 32
8296               ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8297                           : omp::OMPRTL___kmpc_distribute_static_init_4u)
8298               : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8299                           : omp::OMPRTL___kmpc_distribute_static_init_8u);
8300  else
8301    Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8302                                    : omp::OMPRTL___kmpc_for_static_init_4u)
8303                        : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8304                                    : omp::OMPRTL___kmpc_for_static_init_8u);
8305
8306  return getOrCreateRuntimeFunction(M, Name);
8307}
8308
/// NOTE(review): the leading declaration line (source 8309) is missing from
/// this dump; selects __kmpc_dispatch_init_{4,4u,8,8u} by IV size and
/// signedness -- presumably createDispatchInitFunction; confirm against the
/// full source.
8310                                                            bool IVSigned) {
8311  assert((IVSize == 32 || IVSize == 64) &&
8312         "IV size is not compatible with the omp runtime");
8313  RuntimeFunction Name = IVSize == 32
8314                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8315                                         : omp::OMPRTL___kmpc_dispatch_init_4u)
8316                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8317                                         : omp::OMPRTL___kmpc_dispatch_init_8u);
8318
8319  return getOrCreateRuntimeFunction(M, Name);
8320}
8321
/// NOTE(review): the leading declaration line (source 8322) is missing from
/// this dump; selects __kmpc_dispatch_next_{4,4u,8,8u} by IV size and
/// signedness -- presumably createDispatchNextFunction; confirm against the
/// full source.
8323                                                            bool IVSigned) {
8324  assert((IVSize == 32 || IVSize == 64) &&
8325         "IV size is not compatible with the omp runtime");
8326  RuntimeFunction Name = IVSize == 32
8327                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8328                                         : omp::OMPRTL___kmpc_dispatch_next_4u)
8329                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8330                                         : omp::OMPRTL___kmpc_dispatch_next_8u);
8331
8332  return getOrCreateRuntimeFunction(M, Name);
8333}
8334
/// NOTE(review): the leading declaration line (source 8335) is missing from
/// this dump; selects __kmpc_dispatch_fini_{4,4u,8,8u} by IV size and
/// signedness -- presumably createDispatchFiniFunction; confirm against the
/// full source.
8336                                                            bool IVSigned) {
8337  assert((IVSize == 32 || IVSize == 64) &&
8338         "IV size is not compatible with the omp runtime");
8339  RuntimeFunction Name = IVSize == 32
8340                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8341                                         : omp::OMPRTL___kmpc_dispatch_fini_4u)
8342                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8343                                         : omp::OMPRTL___kmpc_dispatch_fini_8u);
8344
8345  return getOrCreateRuntimeFunction(M, Name);
8346}
8347
// Returns the __kmpc_dispatch_deinit runtime entry point. Unlike the
// init/next/fini variants above, there is a single version independent of
// the induction variable's width/signedness.
8349 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8350}
8351
// Repairs the debug info of an outlined target-region function: debug records
// still referencing values/variables of the parent function are remapped (via
// ValueReplacementMap) to the outlined function's arguments, misplaced debug
// records are moved to the block owning their location operand, and on the
// device an artificial "dyn_ptr" parameter variable is synthesized.
// NOTE(review): the `static void FixupDebugInfoForOutlinedFunction(` line and
// a few declaration lines are elided in this view.
8353 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8354 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8355
// Nothing to fix if the outlined function carries no debug info at all.
8356 DISubprogram *NewSP = Func->getSubprogram();
8357 if (!NewSP)
8358 return;
8359
8361
// Returns a DILocalVariable equivalent to OldVar but carrying the given
// argument number; cached so the same (variable, arg) pair is reused.
8362 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8363 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8364 // Only use cached variable if the arg number matches. This is important
8365 // so that DIVariable created for privatized variables are not discarded.
8366 if (NewVar && (arg == NewVar->getArg()))
8367 return NewVar;
8368
8370 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8371 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8372 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8373 return NewVar;
8374 };
8375
// Rewrites a single debug record: location operands found in
// ValueReplacementMap are replaced with the outlined function's copies, and
// if any operand was an argument, the variable is re-created with that
// argument number (ArgNo is 1-based, hence the +1).
8376 auto UpdateDebugRecord = [&](auto *DR) {
8377 DILocalVariable *OldVar = DR->getVariable();
8378 unsigned ArgNo = 0;
8379 for (auto Loc : DR->location_ops()) {
8380 auto Iter = ValueReplacementMap.find(Loc);
8381 if (Iter != ValueReplacementMap.end()) {
8382 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8383 ArgNo = std::get<1>(Iter->second) + 1;
8384 }
8385 }
8386 if (ArgNo != 0)
8387 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8388 };
8389
// Moves a debug record into the block that defines its (single) location
// operand: the defining instruction's block, or the entry block for
// arguments. Records with multiple location operands are killed instead.
8391 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8392 if (DVR->getNumVariableLocationOps() != 1u) {
8393 DVR->setKillLocation();
8394 return;
8395 }
8396 Value *Loc = DVR->getVariableLocationOp(0u);
8397 BasicBlock *CurBB = DVR->getParent();
8398 BasicBlock *RequiredBB = nullptr;
8399
8400 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8401 RequiredBB = LocInst->getParent();
8402 else if (isa<llvm::Argument>(Loc))
8403 RequiredBB = &DVR->getFunction()->getEntryBlock();
8404
// A clone is inserted in the right block; the original is only collected
// here and dropped after iteration to avoid invalidating the record range.
8405 if (RequiredBB && RequiredBB != CurBB) {
8406 assert(!RequiredBB->empty());
8407 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8408 RequiredBB->back().getIterator());
8409 DVRsToDelete.push_back(DVR);
8410 }
8411 };
8412
8413 // The location and scope of variable intrinsics and records still point to
8414 // the parent function of the target region. Update them.
8415 for (Instruction &I : instructions(Func)) {
8417 "Unexpected debug intrinsic");
8418 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8419 UpdateDebugRecord(&DVR);
8420 MoveDebugRecordToCorrectBlock(&DVR);
8421 }
8422 }
8423 for (auto *DVR : DVRsToDelete)
8424 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8425 // An extra argument is passed to the device. Create the debug data for it.
8426 if (OMPBuilder.Config.isTargetDevice()) {
8427 DICompileUnit *CU = NewSP->getUnit();
8428 Module *M = Func->getParent();
8429 DIBuilder DB(*M, true, CU);
8430 DIType *VoidPtrTy =
8431 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
// dyn_ptr is the trailing kernel argument; its parameter variable is marked
// artificial since it does not correspond to any user-written variable.
8432 unsigned ArgNo = Func->arg_size();
8433 DILocalVariable *Var = DB.createParameterVariable(
8434 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8435 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8436 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8437 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8438 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8439 &(*Func->begin()));
8440 }
8441}
8442
// Peels off a single addrspacecast operator (instruction or constant
// expression) if V is one, returning its pointer operand; otherwise returns
// V unchanged. NOTE(review): the signature line is elided in this view.
8444 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8445 return cast<Operator>(V)->getOperand(0);
8446 return V;
8447}
8448
// Creates the outlined function for a target region: builds its signature
// from the region's live-in Inputs (plus a trailing implicit dyn_ptr
// argument), emits target init/deinit on device, generates the region body
// via the callbacks, rewrites uses of the captured Inputs to the new
// arguments, and finally fixes up debug info.
// NOTE(review): the `static Expected<Function *> createOutlinedFunction(`
// line and a handful of interior lines are elided in this view.
8450 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8452 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8455 SmallVector<Type *> ParameterTypes;
8456 if (OMPBuilder.Config.isTargetDevice()) {
8457 // All parameters to target devices are passed as pointers
8458 // or i64. This assumes 64-bit address spaces/pointers.
8459 for (auto &Arg : Inputs)
8460 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8461 ? Arg->getType()
8462 : Type::getInt64Ty(Builder.getContext()));
8463 } else {
8464 for (auto &Arg : Inputs)
8465 ParameterTypes.push_back(Arg->getType());
8466 }
8467
8468 // The implicit dyn_ptr argument is always the last parameter on both host
8469 // and device so the argument counts match without runtime manipulation.
8470 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8471 ParameterTypes.push_back(PtrTy);
8472
8473 auto BB = Builder.GetInsertBlock();
8474 auto M = BB->getModule();
8475 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8476 /*isVarArg*/ false);
8477 auto Func =
8478 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8479
8480 // Forward target-cpu and target-features function attributes from the
8481 // original function to the new outlined function.
8482 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8483
8484 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8485 if (TargetCpuAttr.isStringAttribute())
8486 Func->addFnAttr(TargetCpuAttr);
8487
8488 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8489 if (TargetFeaturesAttr.isStringAttribute())
8490 Func->addFnAttr(TargetFeaturesAttr);
8491
// On device, record the kernel's execution mode and keep it alive in
// llvm.compiler.used so the backend/runtime can read it.
8492 if (OMPBuilder.Config.isTargetDevice()) {
8493 Value *ExecMode =
8494 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8495 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8496 }
8497
8498 // Save insert point.
8499 IRBuilder<>::InsertPointGuard IPG(Builder);
8500 // We will generate the entries in the outlined function but the debug
8501 // location may still be pointing to the parent function. Reset it now.
8502 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8503
8504 // Generate the region into the function.
8505 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8506 Builder.SetInsertPoint(EntryBB);
8507
8508 // Insert target init call in the device compilation pass.
8509 if (OMPBuilder.Config.isTargetDevice())
8510 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8511
8512 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8513
8514 // As we embed the user code in the middle of our target region after we
8515 // generate entry code, we must move what allocas we can into the entry
8516 // block to avoid possible breaking optimisations for device
// NOTE(review): the statement guarded by this `if` is elided in this view.
8517 if (OMPBuilder.Config.isTargetDevice())
8519
8520 // Insert target deinit call in the device compilation pass.
8521 BasicBlock *OutlinedBodyBB =
8522 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
// Generate the user body via the callback; failure propagates to the caller.
8524 Builder.saveIP(),
8525 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8526 if (!AfterIP)
8527 return AfterIP.takeError();
8528 Builder.restoreIP(*AfterIP);
8529 if (OMPBuilder.Config.isTargetDevice())
8530 OMPBuilder.createTargetDeinit(Builder);
8531
8532 // Insert return instruction.
8533 Builder.CreateRetVoid();
8534
8535 // New Alloca IP at entry point of created device function.
8536 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8537 auto AllocaIP = Builder.saveIP();
8538
8539 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8540
8541 // Do not include the artificial dyn_ptr argument.
8542 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8543
8545
// Replaces, inside Func only, all uses of Input with InputCopy (the value
// derived from the corresponding function argument).
8546 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8547 // Things like GEP's can come in the form of Constants. Constants and
8548 // ConstantExpr's do not have access to the knowledge of what they're
8549 // contained in, so we must dig a little to find an instruction so we
8550 // can tell if they're used inside of the function we're outlining. We
8551 // also replace the original constant expression with a new instruction
8552 // equivalent; an instruction as it allows easy modification in the
8553 // following loop, as we can now know the constant (instruction) is
8554 // owned by our target function and replaceUsesOfWith can now be invoked
8555 // on it (cannot do this with constants it seems). A brand new one also
8556 // allows us to be cautious as it is perhaps possible the old expression
8557 // was used inside of the function but exists and is used externally
8558 // (unlikely by the nature of a Constant, but still).
8559 // NOTE: We cannot remove dead constants that have been rewritten to
8560 // instructions at this stage, we run the risk of breaking later lowering
8561 // by doing so as we could still be in the process of lowering the module
8562 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8563 // constants we have created rewritten versions of.
8564 if (auto *Const = dyn_cast<Constant>(Input))
8565 convertUsersOfConstantsToInstructions(Const, Func, false);
8566
8567 // Collect users before iterating over them to avoid invalidating the
8568 // iteration in case a user uses Input more than once (e.g. a call
8569 // instruction).
8570 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8571 // Collect all the instructions
8573 if (auto *Instr = dyn_cast<Instruction>(User))
8574 if (Instr->getFunction() == Func)
8575 Instr->replaceUsesOfWith(Input, InputCopy);
8576 };
8577
8578 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8579
8580 // Rewrite uses of input values to parameters.
8581 for (auto InArg : zip(Inputs, ArgRange)) {
8582 Value *Input = std::get<0>(InArg);
8583 Argument &Arg = std::get<1>(InArg);
8584 Value *InputCopy = nullptr;
8585
// Let the caller materialize the argument into a usable value (load,
// cast, etc.); failure propagates to the caller.
8587 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8588 if (!AfterIP)
8589 return AfterIP.takeError();
8590 Builder.restoreIP(*AfterIP);
// Record the mapping so debug info can later be retargeted to the copy.
8591 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8592
8593 // In certain cases a Global may be set up for replacement, however, this
8594 // Global may be used in multiple arguments to the kernel, just segmented
8595 // apart, for example, if we have a global array, that is sectioned into
8596 // multiple mappings (technically not legal in OpenMP, but there is a case
8597 // in Fortran for Common Blocks where this is necessary), we will end up
8598 // with GEP's into this array inside the kernel, that refer to the Global
8599 // but are technically separate arguments to the kernel for all intents and
8600 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8601 // index, it will fold into a reference to the Global, if we then encounter
8602 // this folded GEP during replacement all of the references to the
8603 // Global in the kernel will be replaced with the argument we have generated
8604 // that corresponds to it, including any other GEP's that refer to the
8605 // Global that may be other arguments. This will invalidate all of the other
8606 // preceding mapped arguments that refer to the same global that may be
8607 // separate segments. To prevent this, we defer global processing until all
8608 // other processing has been performed.
8611 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8612 continue;
8613 }
8614
8616 continue;
8617
8618 ReplaceValue(Input, InputCopy, Func);
8619 }
8620
8621 // Replace all of our deferred Input values, currently just Globals.
8622 for (auto Deferred : DeferredReplacement)
8623 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8624
8625 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8626 ValueReplacementMap);
8627 return Func;
8628}
8629/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8630/// of pointers containing shared data between the parent task and the created
8631/// task.
// NOTE(review): the function's name/signature line is elided in this view.
8633 IRBuilderBase &Builder,
8634 Value *TaskWithPrivates,
8635 Type *TaskWithPrivatesTy) {
8636
8637 Type *TaskTy = OMPIRBuilder.Task;
8638 LLVMContext &Ctx = Builder.getContext();
// GEP to member 0 of the wrapper: the embedded task descriptor.
8639 Value *TaskT =
8640 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8641 Value *Shareds = TaskT;
8642 // TaskWithPrivatesTy can be one of the following
8643 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8644 // %struct.privates }
8645 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8646 //
8647 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8648 // its first member has to be the task descriptor. TaskTy is the type of the
8649 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8650 // first member of TaskT, gives us the pointer to shared data.
8651 if (TaskWithPrivatesTy != TaskTy)
8652 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8653 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8654}
8655/// Create an entry point for a target task with the following.
8656/// It'll have the following signature
8657/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8658/// This function is called from emitTargetTask once the
8659/// code to launch the target kernel has been outlined already.
8660/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8661/// into the task structure so that the deferred target task can access this
8662/// data even after the stack frame of the generating task has been rolled
8663/// back. Offloading arrays contain base pointers, pointers, sizes etc
8664/// of the data that the target kernel will access. These in effect are the
8665/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// NOTE(review): the function's name/signature line is elided in this view.
8667 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8668 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8669 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8670
8671 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8672 // This is because PrivatesTy is the type of the structure in which
8673 // we pass the offloading arrays to the deferred target task.
8674 assert((!NumOffloadingArrays || PrivatesTy) &&
8675 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8676 "to privatize");
8677
8678 Module &M = OMPBuilder.M;
8679 // KernelLaunchFunction is the target launch function, i.e.
8680 // the function that sets up kernel arguments and calls
8681 // __tgt_target_kernel to launch the kernel on the device.
8682 //
8683 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8684
8685 // StaleCI is the CallInst which is the call to the outlined
8686 // target kernel launch function. If there are local live-in values
8687 // that the outlined function uses then these are aggregated into a structure
8688 // which is passed as the second argument. If there are no local live-in
8689 // values or if all values used by the outlined kernel are global variables,
8690 // then there's only one argument, the threadID. So, StaleCI can be
8691 //
8692 // %structArg = alloca { ptr, ptr }, align 8
8693 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8694 // store ptr %20, ptr %gep_, align 8
8695 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8696 // store ptr %21, ptr %gep_8, align 8
8697 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8698 //
8699 // OR
8700 //
8701 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8703 StaleCI->getIterator());
8704
8705 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8706
// The proxy's fixed signature: (i32 thread id, ptr task descriptor). This
// matches what the task runtime expects of a task entry point.
8707 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8708 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8709 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8710
8711 auto ProxyFnTy =
8712 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8713 /* isVarArg */ false);
8714 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8715 ".omp_target_task_proxy_func",
8716 Builder.GetInsertBlock()->getModule());
8717 Value *ThreadId = ProxyFn->getArg(0);
8718 Value *TaskWithPrivates = ProxyFn->getArg(1);
8719 ThreadId->setName("thread.id");
8720 TaskWithPrivates->setName("task");
8721
8722 bool HasShareds = SharedArgsOperandNo > 0;
8723 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8724 BasicBlock *EntryBB =
8725 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8726 Builder.SetInsertPoint(EntryBB);
8727
// Rebuild the kernel-launch argument list inside the proxy: thread id
// first, then the privatized offloading arrays, then the shareds copy.
8728 SmallVector<Value *> KernelLaunchArgs;
8729 KernelLaunchArgs.reserve(StaleCI->arg_size());
8730 KernelLaunchArgs.push_back(ThreadId);
8731
8732 if (HasOffloadingArrays) {
8733 assert(TaskTy != TaskWithPrivatesTy &&
8734 "If there are offloading arrays to pass to the target"
8735 "TaskTy cannot be the same as TaskWithPrivatesTy");
8736 (void)TaskTy;
8737 Value *Privates =
8738 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8739 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8740 KernelLaunchArgs.push_back(
8741 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8742 }
8743
8744 if (HasShareds) {
8745 auto *ArgStructAlloca =
8746 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8747 assert(ArgStructAlloca &&
8748 "Unable to find the alloca instruction corresponding to arguments "
8749 "for extracted function");
8750 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8751 std::optional<TypeSize> ArgAllocSize =
8752 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8753 assert(ArgStructType && ArgAllocSize &&
8754 "Unable to determine size of arguments for extracted function");
8755 uint64_t StructSize = ArgAllocSize->getFixedValue();
8756
// Copy the shared-data block out of the task descriptor into a fresh
// local struct, since the kernel launch function expects a plain pointer
// to the aggregated live-in values.
8757 AllocaInst *NewArgStructAlloca =
8758 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8759
8760 Value *SharedsSize = Builder.getInt64(StructSize);
8761
8763 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8764
8765 Builder.CreateMemCpy(
8766 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8767 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8768 KernelLaunchArgs.push_back(NewArgStructAlloca);
8769 }
8770 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8771 Builder.CreateRetVoid();
8772 return ProxyFn;
8773}
// Returns the array type behind a pointer to an offloading array: the source
// element type for a GEP, or the allocated type for an alloca. Any other
// producer of the pointer is a bug (llvm_unreachable).
// NOTE(review): the function's name/signature line is elided in this view.
8775
8776 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8777 return GEP->getSourceElementType();
8778 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8779 return Alloca->getAllocatedType();
8780
8781 llvm_unreachable("Unhandled Instruction type");
8782 return nullptr;
8783}
8784// This function returns a struct that has at most two members.
8785// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8786// descriptor. The second member, if needed, is a struct containing arrays
8787// that need to be passed to the offloaded target kernel. For example,
8788// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8789// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8790// respectively, then the types created by this function are
8791//
8792// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8793// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8794// %struct.privates }
8795// %struct.task_with_privates is returned by this function.
8796// If there aren't any offloading arrays to pass to the target kernel,
8797// %struct.kmp_task_ompbuilder_t is returned.
8798static StructType *
// NOTE(review): the line naming this function is elided in this view.
8800 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8801
// No privates needed: the plain task descriptor type suffices.
8802 if (OffloadingArraysToPrivatize.empty())
8803 return OMPIRBuilder.Task;
8804
// One struct field per offloading array, in the given order, so field
// indices line up with OffloadingArraysToPrivatize indices.
8805 SmallVector<Type *, 4> StructFieldTypes;
8806 for (Value *V : OffloadingArraysToPrivatize) {
8807 assert(V->getType()->isPointerTy() &&
8808 "Expected pointer to array to privatize. Got a non-pointer value "
8809 "instead");
8810 Type *ArrayTy = getOffloadingArrayType(V);
8811 assert(ArrayTy && "ArrayType cannot be nullptr");
8812 StructFieldTypes.push_back(ArrayTy);
8813 }
8814 StructType *PrivatesStructTy =
8815 StructType::create(StructFieldTypes, "struct.privates");
8816 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8817 "struct.task_with_privates");
8818}
// Wraps createOutlinedFunction in a FunctionGenCallback and hands it to
// OpenMPIRBuilder::emitTargetRegionFunction, which names the entry point,
// registers the offload entry if requested, and fills in OutlinedFn and
// OutlinedFnID. NOTE(review): this function's name/signature line is elided
// in this view.
8820 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8821 TargetRegionEntryInfo &EntryInfo,
8823 Function *&OutlinedFn, Constant *&OutlinedFnID,
8827
8828 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8829 [&](StringRef EntryFnName) {
8830 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8831 EntryFnName, Inputs, CBFunc,
8832 ArgAccessorFuncCB);
8833 };
8834
8835 return OMPBuilder.emitTargetRegionFunction(
8836 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8837 OutlinedFnID);
8838}
8839
8841 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8844 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8845
8846 // The following explains the code-gen scenario for the `target` directive. A
8847 // similar scenario is followed for other device-related directives (e.g.
8848 // `target enter data`), albeit in a simpler fashion, since we only need to
8849 // emit a task that encapsulates the proper runtime call.
8850 //
8851 // When we arrive at this function, the target region itself has been
8852 // outlined into the function OutlinedFn.
8853 // So at this point, for
8854 // --------------------------------------------------------------
8855 // void user_code_that_offloads(...) {
8856 // omp target depend(..) map(from:a) map(to:b) private(i)
8857 // do i = 1, 10
8858 // a(i) = b(i) + n
8859 // }
8860 //
8861 // --------------------------------------------------------------
8862 //
8863 // we have
8864 //
8865 // --------------------------------------------------------------
8866 //
8867 // void user_code_that_offloads(...) {
8868 // %.offload_baseptrs = alloca [2 x ptr], align 8
8869 // %.offload_ptrs = alloca [2 x ptr], align 8
8870 // %.offload_mappers = alloca [2 x ptr], align 8
8871 // ;; target region has been outlined and now we need to
8872 // ;; offload to it via a target task.
8873 // }
8874 // void outlined_device_function(ptr a, ptr b, ptr n) {
8875 // n = *n_ptr;
8876 // do i = 1, 10
8877 // a(i) = b(i) + n
8878 // }
8879 //
8880 // We have to now do the following
8881 // (i) Make an offloading call to outlined_device_function using the OpenMP
8882 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8883 // emitted by emitKernelLaunch
8884 // (ii) Create a task entry point function that calls kernel_launch_function
8885 // and is the entry point for the target task. See
8886 // '@.omp_target_task_proxy_func in the pseudocode below.
8887 // (iii) Create a task with the task entry point created in (ii)
8888 //
8889 // That is we create the following
8890 // struct task_with_privates {
8891 // struct kmp_task_ompbuilder_t task_struct;
8892 // struct privates {
8893 // [2 x ptr] ; baseptrs
8894 // [2 x ptr] ; ptrs
8895 // [2 x i64] ; sizes
8896 // }
8897 // }
8898 // void user_code_that_offloads(...) {
8899 // %.offload_baseptrs = alloca [2 x ptr], align 8
8900 // %.offload_ptrs = alloca [2 x ptr], align 8
8901 // %.offload_sizes = alloca [2 x i64], align 8
8902 //
8903 // %structArg = alloca { ptr, ptr, ptr }, align 8
8904 // %strucArg[0] = a
8905 // %strucArg[1] = b
8906 // %strucArg[2] = &n
8907 //
8908 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8909 // sizeof(kmp_task_ompbuilder_t),
8910 // sizeof(structArg),
8911 // @.omp_target_task_proxy_func,
8912 // ...)
8913 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8914 // sizeof(structArg))
8915 // memcpy(target_task_with_privates->privates->baseptrs,
8916 // offload_baseptrs, sizeof(offload_baseptrs)
8917 // memcpy(target_task_with_privates->privates->ptrs,
8918 // offload_ptrs, sizeof(offload_ptrs)
8919 // memcpy(target_task_with_privates->privates->sizes,
8920 // offload_sizes, sizeof(offload_sizes)
8921 // dependencies_array = ...
8922 // ;; if nowait not present
8923 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8924 // call @__kmpc_omp_task_begin_if0(...)
8925 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8926 // %target_task_with_privates)
8927 // call @__kmpc_omp_task_complete_if0(...)
8928 // }
8929 //
8930 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8931 // ptr %task) {
8932 // %structArg = alloca {ptr, ptr, ptr}
8933 // %task_ptr = getelementptr(%task, 0, 0)
8934 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8935 // mempcy(%structArg, %shared_data, sizeof(%structArg))
8936 //
8937 // %offloading_arrays = getelementptr(%task, 0, 1)
8938 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8939 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8940 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8941 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8942 // %offload_sizes, %structArg)
8943 // }
8944 //
8945 // We need the proxy function because the signature of the task entry point
8946 // expected by kmpc_omp_task is always the same and will be different from
8947 // that of the kernel_launch function.
8948 //
8949 // kernel_launch_function is generated by emitKernelLaunch and has the
8950 // always_inline attribute. For this example, it'll look like so:
8951 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8952 // %offload_sizes, %structArg) alwaysinline {
8953 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8954 // ; load aggregated data from %structArg
8955 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8956 // ; offload_sizes
8957 // call i32 @__tgt_target_kernel(...,
8958 // outlined_device_function,
8959 // ptr %kernel_args)
8960 // }
8961 // void outlined_device_function(ptr a, ptr b, ptr n) {
8962 // n = *n_ptr;
8963 // do i = 1, 10
8964 // a(i) = b(i) + n
8965 // }
8966 //
8967 BasicBlock *TargetTaskBodyBB =
8968 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8969 BasicBlock *TargetTaskAllocaBB =
8970 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8971
8972 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8973 TargetTaskAllocaBB->begin());
8974 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8975
8976 OutlineInfo OI;
8977 OI.EntryBB = TargetTaskAllocaBB;
8978 OI.OuterAllocaBB = AllocaIP.getBlock();
8979
8980 // Add the thread ID argument.
8983 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8984
8985 // Generate the task body which will subsequently be outlined.
8986 Builder.restoreIP(TargetTaskBodyIP);
8987 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8988 return Err;
8989
8990 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8991 // it is given. These blocks are enumerated by
8992 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8993 // to be outside the region. In other words, OI.ExitBlock is expected to be
8994 // the start of the region after the outlining. We used to set OI.ExitBlock
8995 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8996 // except when the task body is a single basic block. In that case,
8997 // OI.ExitBlock is set to the single task body block and will get left out of
8998 // the outlining process. So, simply create a new empty block to which we
8999 // unconditionally branch from where TaskBodyCB left off.
9000 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9001 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
9002 /*IsFinished=*/true);
9003
9004 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9005 bool NeedsTargetTask = HasNoWait && DeviceID;
9006 if (NeedsTargetTask) {
9007 for (auto *V :
9008 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9009 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9010 RTArgs.SizesArray}) {
9012 OffloadingArraysToPrivatize.push_back(V);
9014 }
9015 }
9016 }
9017 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9018 DeviceID, OffloadingArraysToPrivatize](
9019 Function &OutlinedFn) mutable {
9020 assert(OutlinedFn.hasOneUse() &&
9021 "there must be a single user for the outlined function");
9022
9023 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9024
9025 // The first argument of StaleCI is always the thread id.
9026 // The next few arguments are the pointers to offloading arrays
9027 // if any. (see OffloadingArraysToPrivatize)
9028 // Finally, all other local values that are live-in into the outlined region
9029 // end up in a structure whose pointer is passed as the last argument. This
9030 // piece of data is passed in the "shared" field of the task structure. So,
9031 // we know we have to pass shareds to the task if the number of arguments is
9032 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
9033 // thread id. Further, for safety, we assert that the number of arguments of
9034 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
9035 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9036 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9037 assert((!HasShareds ||
9038 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9039 "Wrong number of arguments for StaleCI when shareds are present");
9040 int SharedArgOperandNo =
9041 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9042
9043 StructType *TaskWithPrivatesTy =
9044 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9045 StructType *PrivatesTy = nullptr;
9046
9047 if (!OffloadingArraysToPrivatize.empty())
9048 PrivatesTy =
9049 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9050
9052 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9053 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9054
9055 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9056 << "\n");
9057
9058 Builder.SetInsertPoint(StaleCI);
9059
9060 // Gather the arguments for emitting the runtime call.
9061 uint32_t SrcLocStrSize;
9062 Constant *SrcLocStr =
9064 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9065
9066 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9067 //
9068 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9069 // the DeviceID to the deferred task and also since
9070 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9071 Function *TaskAllocFn =
9072 !NeedsTargetTask
9073 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9075 OMPRTL___kmpc_omp_target_task_alloc);
9076
9077 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9078 // call.
9079 Value *ThreadID = getOrCreateThreadID(Ident);
9080
9081 // Argument - `sizeof_kmp_task_t` (TaskSize)
9082 // Tasksize refers to the size in bytes of kmp_task_t data structure
9083 // plus any other data to be passed to the target task, if any, which
9084 // is packed into a struct. kmp_task_t and the struct so created are
9085 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9086 Value *TaskSize = Builder.getInt64(
9087 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9088
9089 // Argument - `sizeof_shareds` (SharedsSize)
9090 // SharedsSize refers to the shareds array size in the kmp_task_t data
9091 // structure.
9092 Value *SharedsSize = Builder.getInt64(0);
9093 if (HasShareds) {
9094 auto *ArgStructAlloca =
9095 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9096 assert(ArgStructAlloca &&
9097 "Unable to find the alloca instruction corresponding to arguments "
9098 "for extracted function");
9099 std::optional<TypeSize> ArgAllocSize =
9100 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9101 assert(ArgAllocSize &&
9102 "Unable to determine size of arguments for extracted function");
9103 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9104 }
9105
9106 // Argument - `flags`
9107 // Task is tied iff (Flags & 1) == 1.
9108 // Task is untied iff (Flags & 1) == 0.
9109 // Task is final iff (Flags & 2) == 2.
9110 // Task is not final iff (Flags & 2) == 0.
9111 // A target task is not final and is untied.
9112 Value *Flags = Builder.getInt32(0);
9113
9114 // Emit the @__kmpc_omp_task_alloc runtime call
9115 // The runtime call returns a pointer to an area where the task captured
9116 // variables must be copied before the task is run (TaskData)
9117 CallInst *TaskData = nullptr;
9118
9119 SmallVector<llvm::Value *> TaskAllocArgs = {
9120 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9121 /*flags=*/Flags,
9122 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9123 /*task_func=*/ProxyFn};
9124
9125 if (NeedsTargetTask) {
9126 assert(DeviceID && "Expected non-empty device ID.");
9127 TaskAllocArgs.push_back(DeviceID);
9128 }
9129
9130 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9131
9132 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9133 if (HasShareds) {
9134 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9136 *this, Builder, TaskData, TaskWithPrivatesTy);
9137 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9138 SharedsSize);
9139 }
9140 if (!OffloadingArraysToPrivatize.empty()) {
9141 Value *Privates =
9142 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9143 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9144 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9145 [[maybe_unused]] Type *ArrayType =
9146 getOffloadingArrayType(PtrToPrivatize);
9147 assert(ArrayType && "ArrayType cannot be nullptr");
9148
9149 Type *ElementType = PrivatesTy->getElementType(i);
9150 assert(ElementType == ArrayType &&
9151 "ElementType should match ArrayType");
9152 (void)ArrayType;
9153
9154 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9155 Builder.CreateMemCpy(
9156 Dst, Alignment, PtrToPrivatize, Alignment,
9157 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9158 }
9159 }
9160
9161 Value *DepArray = emitTaskDependencies(*this, Dependencies);
9162
9163 // ---------------------------------------------------------------
9164 // V5.2 13.8 target construct
9165 // If the nowait clause is present, execution of the target task
9166 // may be deferred. If the nowait clause is not present, the target task is
9167 // an included task.
9168 // ---------------------------------------------------------------
9169 // The above means that the lack of a nowait on the target construct
9170 // translates to '#pragma omp task if(0)'
9171 if (!NeedsTargetTask) {
9172 if (DepArray) {
9173 Function *TaskWaitFn =
9174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9176 TaskWaitFn,
9177 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9178 /*ndeps=*/Builder.getInt32(Dependencies.size()),
9179 /*dep_list=*/DepArray,
9180 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9181 /*noalias_dep_list=*/
9183 }
9184 // Included task.
9185 Function *TaskBeginFn =
9186 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9187 Function *TaskCompleteFn =
9188 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9189 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9190 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9191 CI->setDebugLoc(StaleCI->getDebugLoc());
9192 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9193 } else if (DepArray) {
9194 // HasNoWait - meaning the task may be deferred. Call
9195 // __kmpc_omp_task_with_deps if there are dependencies,
9196 // else call __kmpc_omp_task
9197 Function *TaskFn =
9198 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9200 TaskFn,
9201 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
9202 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
9204 } else {
9205 // Emit the @__kmpc_omp_task runtime call to spawn the task
9206 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9207 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9208 }
9209
9210 StaleCI->eraseFromParent();
9211 for (Instruction *I : llvm::reverse(ToBeDeleted))
9212 I->eraseFromParent();
9213 };
9214 addOutlineInfo(std::move(OI));
9215
9216 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9217 << *(Builder.GetInsertBlock()) << "\n");
9218 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9219 << *(Builder.GetInsertBlock()->getParent()->getParent())
9220 << "\n");
9221 return Builder.saveIP();
9222}
9223
9225 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9226 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9227 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9228 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9229 if (Error Err =
9230 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9231 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9232 return Err;
9233 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9234 return Error::success();
9235}
9236
9237static void emitTargetCall(
9238 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9243 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9248 bool HasNoWait, Value *DynCGroupMem,
9249 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9250 // Generate a function call to the host fallback implementation of the target
9251 // region. This is called by the host when no offload entry was generated for
9252 // the target region and when the offloading call fails at runtime.
9253 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9255 Builder.restoreIP(IP);
9256 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9257 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9258 FallbackArgs.push_back(
9259 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9260 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9261 return Builder.saveIP();
9262 };
9263
9264 bool HasDependencies = Dependencies.size() > 0;
9265 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9266
9268
9269 auto TaskBodyCB =
9270 [&](Value *DeviceID, Value *RTLoc,
9271 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9272 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9273 // produce any.
9275 // emitKernelLaunch makes the necessary runtime call to offload the
9276 // kernel. We then outline all that code into a separate function
9277 // ('kernel_launch_function' in the pseudo code above). This function is
9278 // then called by the target task proxy function (see
9279 // '@.omp_target_task_proxy_func' in the pseudo code above)
9280 // "@.omp_target_task_proxy_func' is generated by
9281 // emitTargetTaskProxyFunction.
9282 if (OutlinedFnID && DeviceID)
9283 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9284 EmitTargetCallFallbackCB, KArgs,
9285 DeviceID, RTLoc, TargetTaskAllocaIP);
9286
9287 // We only need to do the outlining if `DeviceID` is set to avoid calling
9288 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9289 // generating the `else` branch of an `if` clause.
9290 //
9291 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9292 // In this case, we execute the host implementation directly.
9293 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9294 }());
9295
9296 OMPBuilder.Builder.restoreIP(AfterIP);
9297 return Error::success();
9298 };
9299
9300 auto &&EmitTargetCallElse =
9301 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9303 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9304 // produce any.
9306 if (RequiresOuterTargetTask) {
9307 // Arguments that are intended to be directly forwarded to an
9308 // emitKernelLaunch call are pased as nullptr, since
9309 // OutlinedFnID=nullptr results in that call not being done.
9311 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9312 /*RTLoc=*/nullptr, AllocaIP,
9313 Dependencies, EmptyRTArgs, HasNoWait);
9314 }
9315 return EmitTargetCallFallbackCB(Builder.saveIP());
9316 }());
9317
9318 Builder.restoreIP(AfterIP);
9319 return Error::success();
9320 };
9321
9322 auto &&EmitTargetCallThen =
9323 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9325 Info.HasNoWait = HasNoWait;
9326 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9327
9329 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9330 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9331 /*IsNonContiguous=*/true,
9332 /*ForEndCall=*/false))
9333 return Err;
9334
9335 SmallVector<Value *, 3> NumTeamsC;
9336 for (auto [DefaultVal, RuntimeVal] :
9337 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9338 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9339 : Builder.getInt32(DefaultVal));
9340
9341 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9342 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9343 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9344 if (Clause)
9345 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9346 /*isSigned=*/false);
9347 return Clause;
9348 };
9349 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9350 if (Clause)
9351 Result =
9352 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9353 Result, Clause)
9354 : Clause;
9355 };
9356
9357 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9358 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9359 SmallVector<Value *, 3> NumThreadsC;
9360 Value *MaxThreadsClause =
9361 RuntimeAttrs.TeamsThreadLimit.size() == 1
9362 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9363 : nullptr;
9364
9365 for (auto [TeamsVal, TargetVal] : zip_equal(
9366 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9367 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9368 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9369
9370 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9371 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9372
9373 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9374 }
9375
9376 unsigned NumTargetItems = Info.NumberOfPtrs;
9377 uint32_t SrcLocStrSize;
9378 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9379 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9380 llvm::omp::IdentFlag(0), 0);
9381
9382 Value *TripCount = RuntimeAttrs.LoopTripCount
9383 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9384 Builder.getInt64Ty(),
9385 /*isSigned=*/false)
9386 : Builder.getInt64(0);
9387
9388 // Request zero groupprivate bytes by default.
9389 if (!DynCGroupMem)
9390 DynCGroupMem = Builder.getInt32(0);
9391
9393 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9394 HasNoWait, DynCGroupMemFallback);
9395
9396 // Assume no error was returned because TaskBodyCB and
9397 // EmitTargetCallFallbackCB don't produce any.
9399 // The presence of certain clauses on the target directive require the
9400 // explicit generation of the target task.
9401 if (RequiresOuterTargetTask)
9402 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9403 RTLoc, AllocaIP, Dependencies,
9404 KArgs.RTArgs, Info.HasNoWait);
9405
9406 return OMPBuilder.emitKernelLaunch(
9407 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9408 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9409 }());
9410
9411 Builder.restoreIP(AfterIP);
9412 return Error::success();
9413 };
9414
9415 // If we don't have an ID for the target region, it means an offload entry
9416 // wasn't created. In this case we just run the host fallback directly and
9417 // ignore any potential 'if' clauses.
9418 if (!OutlinedFnID) {
9419 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9420 return;
9421 }
9422
9423 // If there's no 'if' clause, only generate the kernel launch code path.
9424 if (!IfCond) {
9425 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9426 return;
9427 }
9428
9429 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9430 EmitTargetCallElse, AllocaIP));
9431}
9432
9434 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9435 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9436 TargetRegionEntryInfo &EntryInfo,
9437 const TargetKernelDefaultAttrs &DefaultAttrs,
9438 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9439 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9442 CustomMapperCallbackTy CustomMapperCB,
9443 const SmallVector<DependData> &Dependencies, bool HasNowait,
9444 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9445
9446 if (!updateToLocation(Loc))
9447 return InsertPointTy();
9448
9449 Builder.restoreIP(CodeGenIP);
9450
9451 Function *OutlinedFn;
9452 Constant *OutlinedFnID = nullptr;
9453 // The target region is outlined into its own function. The LLVM IR for
9454 // the target region itself is generated using the callbacks CBFunc
9455 // and ArgAccessorFuncCB
9457 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9458 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9459 return Err;
9460
9461 // If we are not on the target device, then we need to generate code
9462 // to make a remote call (offload) to the previously outlined function
9463 // that represents the target region. Do that now.
9464 if (!Config.isTargetDevice())
9465 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9466 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9467 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9468 DynCGroupMemFallback);
9469 return Builder.saveIP();
9470}
9471
9472std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9473 StringRef FirstSeparator,
9474 StringRef Separator) {
9475 SmallString<128> Buffer;
9476 llvm::raw_svector_ostream OS(Buffer);
9477 StringRef Sep = FirstSeparator;
9478 for (StringRef Part : Parts) {
9479 OS << Sep << Part;
9480 Sep = Separator;
9481 }
9482 return OS.str().str();
9483}
9484
9485std::string
9487 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9488 Config.separator());
9489}
9490
9492 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9493 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9494 if (Elem.second) {
9495 assert(Elem.second->getValueType() == Ty &&
9496 "OMP internal variable has different type than requested");
9497 } else {
9498 // TODO: investigate the appropriate linkage type used for the global
9499 // variable for possibly changing that to internal or private, or maybe
9500 // create different versions of the function for different OMP internal
9501 // variables.
9502 const DataLayout &DL = M.getDataLayout();
9503 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9504 // default global AS is 1.
9505 // See double-target-call-with-declare-target.f90 and
9506 // declare-target-vars-in-target-region.f90 libomptarget
9507 // tests.
9508 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9509 : M.getTargetTriple().isAMDGPU()
9510 ? 0
9511 : DL.getDefaultGlobalsAddressSpace();
9512 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9515 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9516 Constant::getNullValue(Ty), Elem.first(),
9517 /*InsertBefore=*/nullptr,
9518 GlobalValue::NotThreadLocal, AddressSpaceVal);
9519 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9520 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9521 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9522 Elem.second = GV;
9523 }
9524
9525 return Elem.second;
9526}
9527
9528Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9529 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9530 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9531 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9532}
9533
9535 LLVMContext &Ctx = Builder.getContext();
9536 Value *Null =
9537 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9538 Value *SizeGep =
9539 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9540 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9541 return SizePtrToInt;
9542}
9543
9546 std::string VarName) {
9547 llvm::Constant *MaptypesArrayInit =
9548 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9549 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9550 M, MaptypesArrayInit->getType(),
9551 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9552 VarName);
9553 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9554 return MaptypesArrayGlobal;
9555}
9556
9558 InsertPointTy AllocaIP,
9559 unsigned NumOperands,
9560 struct MapperAllocas &MapperAllocas) {
9561 if (!updateToLocation(Loc))
9562 return;
9563
9564 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9565 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9566 Builder.restoreIP(AllocaIP);
9567 AllocaInst *ArgsBase = Builder.CreateAlloca(
9568 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9569 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9570 ".offload_ptrs");
9571 AllocaInst *ArgSizes = Builder.CreateAlloca(
9572 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9574 MapperAllocas.ArgsBase = ArgsBase;
9575 MapperAllocas.Args = Args;
9576 MapperAllocas.ArgSizes = ArgSizes;
9577}
9578
9580 Function *MapperFunc, Value *SrcLocInfo,
9581 Value *MaptypesArg, Value *MapnamesArg,
9583 int64_t DeviceID, unsigned NumOperands) {
9584 if (!updateToLocation(Loc))
9585 return;
9586
9587 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9588 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9589 Value *ArgsBaseGEP =
9590 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9591 {Builder.getInt32(0), Builder.getInt32(0)});
9592 Value *ArgsGEP =
9593 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9594 {Builder.getInt32(0), Builder.getInt32(0)});
9595 Value *ArgSizesGEP =
9596 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9597 {Builder.getInt32(0), Builder.getInt32(0)});
9598 Value *NullPtr =
9599 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9600 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9601 Builder.getInt32(NumOperands),
9602 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9603 MaptypesArg, MapnamesArg, NullPtr});
9604}
9605
9607 TargetDataRTArgs &RTArgs,
9608 TargetDataInfo &Info,
9609 bool ForEndCall) {
9610 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9611 "expected region end call to runtime only when end call is separate");
9612 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9613 auto VoidPtrTy = UnqualPtrTy;
9614 auto VoidPtrPtrTy = UnqualPtrTy;
9615 auto Int64Ty = Type::getInt64Ty(M.getContext());
9616 auto Int64PtrTy = UnqualPtrTy;
9617
9618 if (!Info.NumberOfPtrs) {
9619 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9620 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9621 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9622 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9623 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9624 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9625 return;
9626 }
9627
9628 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9629 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9630 Info.RTArgs.BasePointersArray,
9631 /*Idx0=*/0, /*Idx1=*/0);
9632 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9633 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9634 /*Idx0=*/0,
9635 /*Idx1=*/0);
9636 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9637 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9638 /*Idx0=*/0, /*Idx1=*/0);
9639 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9640 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9641 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9642 : Info.RTArgs.MapTypesArray,
9643 /*Idx0=*/0,
9644 /*Idx1=*/0);
9645
9646 // Only emit the mapper information arrays if debug information is
9647 // requested.
9648 if (!Info.EmitDebug)
9649 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9650 else
9651 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9652 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9653 /*Idx0=*/0,
9654 /*Idx1=*/0);
9655 // If there is no user-defined mapper, set the mapper array to nullptr to
9656 // avoid an unnecessary data privatization
9657 if (!Info.HasMapper)
9658 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9659 else
9660 RTArgs.MappersArray =
9661 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9662}
9663
9665 InsertPointTy CodeGenIP,
9666 MapInfosTy &CombinedInfo,
9667 TargetDataInfo &Info) {
9669 CombinedInfo.NonContigInfo;
9670
9671 // Build an array of struct descriptor_dim and then assign it to
9672 // offload_args.
9673 //
9674 // struct descriptor_dim {
9675 // uint64_t offset;
9676 // uint64_t count;
9677 // uint64_t stride
9678 // };
9679 Type *Int64Ty = Builder.getInt64Ty();
9681 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9682 "struct.descriptor_dim");
9683
9684 enum { OffsetFD = 0, CountFD, StrideFD };
9685 // We need two index variable here since the size of "Dims" is the same as
9686 // the size of Components, however, the size of offset, count, and stride is
9687 // equal to the size of base declaration that is non-contiguous.
9688 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9689 // Skip emitting ir if dimension size is 1 since it cannot be
9690 // non-contiguous.
9691 if (NonContigInfo.Dims[I] == 1)
9692 continue;
9693 Builder.restoreIP(AllocaIP);
9694 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9695 AllocaInst *DimsAddr =
9696 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9697 Builder.restoreIP(CodeGenIP);
9698 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9699 unsigned RevIdx = EE - II - 1;
9700 Value *DimsLVal = Builder.CreateInBoundsGEP(
9701 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9702 // Offset
9703 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9704 Builder.CreateAlignedStore(
9705 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9706 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9707 // Count
9708 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9709 Builder.CreateAlignedStore(
9710 NonContigInfo.Counts[L][RevIdx], CountLVal,
9711 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9712 // Stride
9713 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9714 Builder.CreateAlignedStore(
9715 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9716 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9717 }
9718 // args[I] = &dims
9719 Builder.restoreIP(CodeGenIP);
9720 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9721 DimsAddr, Builder.getPtrTy());
9722 Value *P = Builder.CreateConstInBoundsGEP2_32(
9723 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9724 Info.RTArgs.PointersArray, 0, I);
9725 Builder.CreateAlignedStore(
9726 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9727 ++L;
9728 }
9729}
9730
9731void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9732 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9733 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9734 BasicBlock *ExitBB, bool IsInit) {
9735 StringRef Prefix = IsInit ? ".init" : ".del";
9736
9737 // Evaluate if this is an array section.
9739 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9740 Value *IsArray =
9741 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9742 Value *DeleteBit = Builder.CreateAnd(
9743 MapType,
9744 Builder.getInt64(
9745 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9746 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9747 Value *DeleteCond;
9748 Value *Cond;
9749 if (IsInit) {
9750 // base != begin?
9751 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9752 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9753 DeleteCond = Builder.CreateIsNull(
9754 DeleteBit,
9755 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9756 } else {
9757 Cond = IsArray;
9758 DeleteCond = Builder.CreateIsNotNull(
9759 DeleteBit,
9760 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9761 }
9762 Cond = Builder.CreateAnd(Cond, DeleteCond);
9763 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9764
9765 emitBlock(BodyBB, MapperFn);
9766 // Get the array size by multiplying element size and element number (i.e., \p
9767 // Size).
9768 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9769 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9770 // memory allocation/deletion purpose only.
9771 Value *MapTypeArg = Builder.CreateAnd(
9772 MapType,
9773 Builder.getInt64(
9774 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9775 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9776 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9777 MapTypeArg = Builder.CreateOr(
9778 MapTypeArg,
9779 Builder.getInt64(
9780 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9781 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9782
9783 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9784 // data structure.
9785 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9786 ArraySize, MapTypeArg, MapName};
9788 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9789 OffloadingArgs);
9790}
9791
9794 llvm::Value *BeginArg)>
9795 GenMapInfoCB,
9796 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9797 SmallVector<Type *> Params;
9798 Params.emplace_back(Builder.getPtrTy());
9799 Params.emplace_back(Builder.getPtrTy());
9800 Params.emplace_back(Builder.getPtrTy());
9801 Params.emplace_back(Builder.getInt64Ty());
9802 Params.emplace_back(Builder.getInt64Ty());
9803 Params.emplace_back(Builder.getPtrTy());
9804
9805 auto *FnTy =
9806 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9807
9808 SmallString<64> TyStr;
9809 raw_svector_ostream Out(TyStr);
9810 Function *MapperFn =
9812 MapperFn->addFnAttr(Attribute::NoInline);
9813 MapperFn->addFnAttr(Attribute::NoUnwind);
9814 MapperFn->addParamAttr(0, Attribute::NoUndef);
9815 MapperFn->addParamAttr(1, Attribute::NoUndef);
9816 MapperFn->addParamAttr(2, Attribute::NoUndef);
9817 MapperFn->addParamAttr(3, Attribute::NoUndef);
9818 MapperFn->addParamAttr(4, Attribute::NoUndef);
9819 MapperFn->addParamAttr(5, Attribute::NoUndef);
9820
9821 // Start the mapper function code generation.
9822 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9823 auto SavedIP = Builder.saveIP();
9824 Builder.SetInsertPoint(EntryBB);
9825
9826 Value *MapperHandle = MapperFn->getArg(0);
9827 Value *BaseIn = MapperFn->getArg(1);
9828 Value *BeginIn = MapperFn->getArg(2);
9829 Value *Size = MapperFn->getArg(3);
9830 Value *MapType = MapperFn->getArg(4);
9831 Value *MapName = MapperFn->getArg(5);
9832
9833 // Compute the starting and end addresses of array elements.
9834 // Prepare common arguments for array initiation and deletion.
9835 // Convert the size in bytes into the number of array elements.
9836 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9837 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9838 Value *PtrBegin = BeginIn;
9839 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9840
9841 // Emit array initiation if this is an array section and \p MapType indicates
9842 // that memory allocation is required.
9843 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9844 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9845 MapType, MapName, ElementSize, HeadBB,
9846 /*IsInit=*/true);
9847
9848 // Emit a for loop to iterate through SizeArg of elements and map all of them.
9849
9850 // Emit the loop header block.
9851 emitBlock(HeadBB, MapperFn);
9852 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9853 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9854 // Evaluate whether the initial condition is satisfied.
9855 Value *IsEmpty =
9856 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9857 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9858
9859 // Emit the loop body block.
9860 emitBlock(BodyBB, MapperFn);
9861 BasicBlock *LastBB = BodyBB;
9862 PHINode *PtrPHI =
9863 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9864 PtrPHI->addIncoming(PtrBegin, HeadBB);
9865
9866 // Get map clause information. Fill up the arrays with all mapped variables.
9867 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9868 if (!Info)
9869 return Info.takeError();
9870
9871 // Call the runtime API __tgt_mapper_num_components to get the number of
9872 // pre-existing components.
9873 Value *OffloadingArgs[] = {MapperHandle};
9874 Value *PreviousSize = createRuntimeFunctionCall(
9875 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9876 OffloadingArgs);
9877 Value *ShiftedPreviousSize =
9878 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9879
9880 // Fill up the runtime mapper handle for all components.
9881 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9882 Value *CurBaseArg = Info->BasePointers[I];
9883 Value *CurBeginArg = Info->Pointers[I];
9884 Value *CurSizeArg = Info->Sizes[I];
9885 Value *CurNameArg = Info->Names.size()
9886 ? Info->Names[I]
9887 : Constant::getNullValue(Builder.getPtrTy());
9888
9889 // Extract the MEMBER_OF field from the map type.
9890 Value *OriMapType = Builder.getInt64(
9891 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9892 Info->Types[I]));
9893 Value *MemberMapType =
9894 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9895
9896 // Combine the map type inherited from user-defined mapper with that
9897 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9898 // bits of the \a MapType, which is the input argument of the mapper
9899 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9900 // bits of MemberMapType.
9901 // [OpenMP 5.0], 1.2.6. map-type decay.
9902 // | alloc | to | from | tofrom | release | delete
9903 // ----------------------------------------------------------
9904 // alloc | alloc | alloc | alloc | alloc | release | delete
9905 // to | alloc | to | alloc | to | release | delete
9906 // from | alloc | alloc | from | from | release | delete
9907 // tofrom | alloc | to | from | tofrom | release | delete
9908 Value *LeftToFrom = Builder.CreateAnd(
9909 MapType,
9910 Builder.getInt64(
9911 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9912 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9913 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9914 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9915 BasicBlock *AllocElseBB =
9916 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9917 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9918 BasicBlock *ToElseBB =
9919 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9920 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9921 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9922 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9923 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9924 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9925 emitBlock(AllocBB, MapperFn);
9926 Value *AllocMapType = Builder.CreateAnd(
9927 MemberMapType,
9928 Builder.getInt64(
9929 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9930 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9931 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9932 Builder.CreateBr(EndBB);
9933 emitBlock(AllocElseBB, MapperFn);
9934 Value *IsTo = Builder.CreateICmpEQ(
9935 LeftToFrom,
9936 Builder.getInt64(
9937 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9938 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9939 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9940 // In case of to, clear OMP_MAP_FROM.
9941 emitBlock(ToBB, MapperFn);
9942 Value *ToMapType = Builder.CreateAnd(
9943 MemberMapType,
9944 Builder.getInt64(
9945 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9946 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9947 Builder.CreateBr(EndBB);
9948 emitBlock(ToElseBB, MapperFn);
9949 Value *IsFrom = Builder.CreateICmpEQ(
9950 LeftToFrom,
9951 Builder.getInt64(
9952 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9953 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9954 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9955 // In case of from, clear OMP_MAP_TO.
9956 emitBlock(FromBB, MapperFn);
9957 Value *FromMapType = Builder.CreateAnd(
9958 MemberMapType,
9959 Builder.getInt64(
9960 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9961 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9962 // In case of tofrom, do nothing.
9963 emitBlock(EndBB, MapperFn);
9964 LastBB = EndBB;
9965 PHINode *CurMapType =
9966 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9967 CurMapType->addIncoming(AllocMapType, AllocBB);
9968 CurMapType->addIncoming(ToMapType, ToBB);
9969 CurMapType->addIncoming(FromMapType, FromBB);
9970 CurMapType->addIncoming(MemberMapType, ToElseBB);
9971
9972 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9973 CurSizeArg, CurMapType, CurNameArg};
9974
9975 auto ChildMapperFn = CustomMapperCB(I);
9976 if (!ChildMapperFn)
9977 return ChildMapperFn.takeError();
9978 if (*ChildMapperFn) {
9979 // Call the corresponding mapper function.
9980 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9981 ->setDoesNotThrow();
9982 } else {
9983 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9984 // data structure.
9986 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9987 OffloadingArgs);
9988 }
9989 }
9990
9991 // Update the pointer to point to the next element that needs to be mapped,
9992 // and check whether we have mapped all elements.
9993 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9994 "omp.arraymap.next");
9995 PtrPHI->addIncoming(PtrNext, LastBB);
9996 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9997 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9998 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9999
10000 emitBlock(ExitBB, MapperFn);
10001 // Emit array deletion if this is an array section and \p MapType indicates
10002 // that deletion is required.
10003 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10004 MapType, MapName, ElementSize, DoneBB,
10005 /*IsInit=*/false);
10006
10007 // Emit the function exit block.
10008 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10009
10010 Builder.CreateRetVoid();
10011 Builder.restoreIP(SavedIP);
10012 return MapperFn;
10013}
10014
10016 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10017 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10018 bool IsNonContiguous,
10019 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10020
10021 // Reset the array information.
10022 Info.clearArrayInfo();
10023 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10024
10025 if (Info.NumberOfPtrs == 0)
10026 return Error::success();
10027
10028 Builder.restoreIP(AllocaIP);
10029 // Detect if we have any capture size requiring runtime evaluation of the
10030 // size so that a constant array could be eventually used.
10031 ArrayType *PointerArrayType =
10032 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10033
10034 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10035 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10036
10037 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10038 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10039 AllocaInst *MappersArray = Builder.CreateAlloca(
10040 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10041 Info.RTArgs.MappersArray = MappersArray;
10042
10043 // If we don't have any VLA types or other types that require runtime
10044 // evaluation, we can use a constant array for the map sizes, otherwise we
10045 // need to fill up the arrays as we do for the pointers.
10046 Type *Int64Ty = Builder.getInt64Ty();
10047 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10048 ConstantInt::get(Int64Ty, 0));
10049 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10050 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10051 bool IsNonContigEntry =
10052 IsNonContiguous &&
10053 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10054 CombinedInfo.Types[I] &
10055 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10056 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10057 // descriptor_dim records), not the byte size.
10058 if (IsNonContigEntry) {
10059 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10060 "Index must be in-bounds for NON_CONTIG Dims array");
10061 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10062 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10063 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10064 continue;
10065 }
10066 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10067 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10068 ConstSizes[I] = CI;
10069 continue;
10070 }
10071 }
10072 RuntimeSizes.set(I);
10073 }
10074
10075 if (RuntimeSizes.all()) {
10076 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10077 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10078 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10079 restoreIPandDebugLoc(Builder, CodeGenIP);
10080 } else {
10081 auto *SizesArrayInit = ConstantArray::get(
10082 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10083 std::string Name = createPlatformSpecificName({"offload_sizes"});
10084 auto *SizesArrayGbl =
10085 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10086 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10087 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10088
10089 if (!RuntimeSizes.any()) {
10090 Info.RTArgs.SizesArray = SizesArrayGbl;
10091 } else {
10092 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10093 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10094 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10095 AllocaInst *Buffer = Builder.CreateAlloca(
10096 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10097 Buffer->setAlignment(OffloadSizeAlign);
10098 restoreIPandDebugLoc(Builder, CodeGenIP);
10099 Builder.CreateMemCpy(
10100 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10101 SizesArrayGbl, OffloadSizeAlign,
10102 Builder.getIntN(
10103 IndexSize,
10104 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10105
10106 Info.RTArgs.SizesArray = Buffer;
10107 }
10108 restoreIPandDebugLoc(Builder, CodeGenIP);
10109 }
10110
10111 // The map types are always constant so we don't need to generate code to
10112 // fill arrays. Instead, we create an array constant.
10114 for (auto mapFlag : CombinedInfo.Types)
10115 Mapping.push_back(
10116 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10117 mapFlag));
10118 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10119 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10120 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10121
10122 // The information types are only built if provided.
10123 if (!CombinedInfo.Names.empty()) {
10124 auto *MapNamesArrayGbl = createOffloadMapnames(
10125 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10126 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10127 Info.EmitDebug = true;
10128 } else {
10129 Info.RTArgs.MapNamesArray =
10131 Info.EmitDebug = false;
10132 }
10133
10134 // If there's a present map type modifier, it must not be applied to the end
10135 // of a region, so generate a separate map type array in that case.
10136 if (Info.separateBeginEndCalls()) {
10137 bool EndMapTypesDiffer = false;
10138 for (uint64_t &Type : Mapping) {
10139 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10140 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10141 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10142 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10143 EndMapTypesDiffer = true;
10144 }
10145 }
10146 if (EndMapTypesDiffer) {
10147 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10148 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10149 }
10150 }
10151
10152 PointerType *PtrTy = Builder.getPtrTy();
10153 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10154 Value *BPVal = CombinedInfo.BasePointers[I];
10155 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10156 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10157 0, I);
10158 Builder.CreateAlignedStore(BPVal, BP,
10159 M.getDataLayout().getPrefTypeAlign(PtrTy));
10160
10161 if (Info.requiresDevicePointerInfo()) {
10162 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10163 CodeGenIP = Builder.saveIP();
10164 Builder.restoreIP(AllocaIP);
10165 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10166 Builder.restoreIP(CodeGenIP);
10167 if (DeviceAddrCB)
10168 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10169 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10170 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10171 if (DeviceAddrCB)
10172 DeviceAddrCB(I, BP);
10173 }
10174 }
10175
10176 Value *PVal = CombinedInfo.Pointers[I];
10177 Value *P = Builder.CreateConstInBoundsGEP2_32(
10178 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10179 I);
10180 // TODO: Check alignment correct.
10181 Builder.CreateAlignedStore(PVal, P,
10182 M.getDataLayout().getPrefTypeAlign(PtrTy));
10183
10184 if (RuntimeSizes.test(I)) {
10185 Value *S = Builder.CreateConstInBoundsGEP2_32(
10186 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10187 /*Idx0=*/0,
10188 /*Idx1=*/I);
10189 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10190 Int64Ty,
10191 /*isSigned=*/true),
10192 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10193 }
10194 // Fill up the mapper array.
10195 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10196 Value *MFunc = ConstantPointerNull::get(PtrTy);
10197
10198 auto CustomMFunc = CustomMapperCB(I);
10199 if (!CustomMFunc)
10200 return CustomMFunc.takeError();
10201 if (*CustomMFunc)
10202 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10203
10204 Value *MAddr = Builder.CreateInBoundsGEP(
10205 PointerArrayType, MappersArray,
10206 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10207 Builder.CreateAlignedStore(
10208 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10209 }
10210
10211 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10212 Info.NumberOfPtrs == 0)
10213 return Error::success();
10214 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10215 return Error::success();
10216}
10217
// NOTE(review): the declaration line (original source line 10218) was dropped
// by the extraction. From the body: emits a fall-through branch to `Target`
// from the builder's current block (unless there is no insert block or it is
// already terminated), then clears the insertion point.
10219 BasicBlock *CurBB = Builder.GetInsertBlock();
10220
10221 if (!CurBB || CurBB->hasTerminator()) {
10222 // If there is no insert point or the previous block is already
10223 // terminated, don't touch it.
10224 } else {
10225 // Otherwise, create a fall-through branch.
10226 Builder.CreateBr(Target);
10227 }
10228
// Leave no active insertion point; callers must set one explicitly.
10229 Builder.ClearInsertionPoint();
10230}
10231
// NOTE(review): the declaration line (original source line 10232) was dropped
// by the extraction; only the trailing `bool IsFinished` parameter is visible.
// From the body: appends `BB` to the current function, falls through into it
// from the current block, and makes it the new insertion point. When
// `IsFinished` is set and `BB` ended up unused, the block is deleted instead.
10233 bool IsFinished) {
10234 BasicBlock *CurBB = Builder.GetInsertBlock();
10235
10236 // Fall out of the current block (if necessary).
10237 emitBranch(BB);
10238
// A finished block with no predecessors/uses is dead — drop it.
10239 if (IsFinished && BB->use_empty()) {
10240 BB->eraseFromParent();
10241 return;
10242 }
10243
10244 // Place the block after the current block, if possible, or else at
10245 // the end of the function.
10246 if (CurBB && CurBB->getParent())
10247 CurFn->insert(std::next(CurBB->getIterator()), BB);
10248 else
10249 CurFn->insert(CurFn->end(), BB);
10250 Builder.SetInsertPoint(BB);
10251}
10252
// NOTE(review): the declaration line (original source line 10253) was dropped
// by the extraction; only the trailing parameters are visible. From the body:
// emits `Cond ? ThenGen : ElseGen`, constant-folding away the dead arm when
// `Cond` is a ConstantInt, and returns any Error produced by the callbacks.
10254 BodyGenCallbackTy ElseGen,
10255 InsertPointTy AllocaIP) {
10256 // If the condition constant folds and can be elided, try to avoid emitting
10257 // the condition and the dead arm of the if/else.
10258 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10259 auto CondConstant = CI->getSExtValue();
10260 if (CondConstant)
10261 return ThenGen(AllocaIP, Builder.saveIP());
10262
10263 return ElseGen(AllocaIP, Builder.saveIP());
10264 }
10265
10266 Function *CurFn = Builder.GetInsertBlock()->getParent();
10267
10268 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10269 // emit the conditional branch.
10270 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10271 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10272 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10273 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10274 // Emit the 'then' code.
10275 emitBlock(ThenBlock, CurFn);
10276 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10277 return Err;
10278 emitBranch(ContBlock);
10279 // Emit the 'else' code if present.
10280 // There is no need to emit line number for unconditional branch.
10281 emitBlock(ElseBlock, CurFn);
10282 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10283 return Err;
10284 // There is no need to emit line number for unconditional branch.
10285 emitBranch(ContBlock);
10286 // Emit the continuation block for code after the if.
// IsFinished lets emitBlock delete the continuation block if unused.
10287 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10288 return Error::success();
10289}
10290
// Decide whether an OpenMP flush must follow an atomic construct of kind `AK`
// with ordering `AO`, emit it via emitFlush(Loc) when required, and return
// whether a flush was emitted.
// NOTE(review): this doc-rendered listing dropped several hyperlinked lines —
// the assert condition (original lines 10293–94), the `FlushAO` declaration
// (10298), and the `AO == AtomicOrdering::...` guard lines for each case
// (10302–03, 10311–12, 10319, 10323, 10327–29). The visible bodies show only
// the actions taken when those guards match; confirm the exact ordering
// predicates against the upstream source.
10291bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10292 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10295 "Unexpected Atomic Ordering.");
10296
10297 bool Flush = false;
10299
10300 switch (AK) {
10301 case Read:
// Reads flush with acquire semantics (guard lines dropped, see NOTE).
10304 FlushAO = AtomicOrdering::Acquire;
10305 Flush = true;
10306 }
10307 break;
10308 case Write:
10309 case Compare:
10310 case Update:
// Writes/compares/updates flush with release semantics.
10313 FlushAO = AtomicOrdering::Release;
10314 Flush = true;
10315 }
10316 break;
10317 case Capture:
// Capture depends on the exact ordering: acquire, release, or (for the
// strongest orderings, guard lines dropped) flush with both.
10318 switch (AO) {
10320 FlushAO = AtomicOrdering::Acquire;
10321 Flush = true;
10322 break;
10324 FlushAO = AtomicOrdering::Release;
10325 Flush = true;
10326 break;
10330 Flush = true;
10331 break;
10332 default:
10333 // do nothing - leave silently.
10334 break;
10335 }
10336 }
10337
10338 if (Flush) {
10339 // Currently Flush RT call still doesn't take memory_ordering, so for when
10340 // that happens, this tries to do the resolution of which atomic ordering
10341 // to use with but issue the flush call
10342 // TODO: pass `FlushAO` after memory ordering support is added
10343 (void)FlushAO;
10344 emitFlush(Loc);
10345 }
10346
10347 // for AO == AtomicOrdering::Monotonic and all other case combinations
10348 // do nothing
10349 return Flush;
10350}
10351
// NOTE(review): the declaration lines (original source lines 10352–10354)
// were dropped by the extraction; only the trailing parameters are visible.
// From the body: emits an atomic read of X into V with ordering AO —
// integers load atomically in place, structs go through the AtomicInfo
// libcall path, and float/pointer types load as a same-width integer and are
// cast back — then emits any required flush and returns the insert point.
10355 AtomicOrdering AO, InsertPointTy AllocaIP) {
10356 if (!updateToLocation(Loc))
10357 return Loc.IP;
10358
10359 assert(X.Var->getType()->isPointerTy() &&
10360 "OMP Atomic expects a pointer to target memory");
10361 Type *XElemTy = X.ElemTy;
10362 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10363 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10364 "OMP atomic read expected a scalar type");
10365
10366 Value *XRead = nullptr;
10367
10368 if (XElemTy->isIntegerTy()) {
// Native atomic load is sufficient for integer element types.
10369 LoadInst *XLD =
10370 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10371 XLD->setAtomic(AO);
10372 XRead = cast<Value>(XLD);
10373 } else if (XElemTy->isStructTy()) {
10374 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10375 // target does not support `atomicrmw` of the size of the struct
10376 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10377 OldVal->setAtomic(AO);
10378 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10379 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10380 OpenMPIRBuilder::AtomicInfo atomicInfo(
10381 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10382 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10383 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10384 XRead = AtomicLoadRes.first;
// The plain load was only a template for alignment/size; remove it.
10385 OldVal->eraseFromParent();
10386 } else {
10387 // We need to perform atomic op as integer
10388 IntegerType *IntCastTy =
10389 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10390 LoadInst *XLoad =
10391 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10392 XLoad->setAtomic(AO);
10393 if (XElemTy->isFloatingPointTy()) {
10394 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10395 } else {
10396 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10397 }
10398 }
10399 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
// The captured value is stored to V non-atomically (V is private).
10400 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10401 return Builder.saveIP();
10402}
10403
// NOTE(review): the declaration lines (original source lines 10404–10405)
// were dropped by the extraction; only the trailing parameters are visible.
// From the body: emits an atomic store of Expr into X with ordering AO —
// integers store atomically in place, structs use the AtomicInfo store
// libcall, other scalars are bitcast to a same-width integer first — then
// emits any required flush and returns the insert point.
10406 AtomicOpValue &X, Value *Expr,
10407 AtomicOrdering AO, InsertPointTy AllocaIP) {
10408 if (!updateToLocation(Loc))
10409 return Loc.IP;
10410
10411 assert(X.Var->getType()->isPointerTy() &&
10412 "OMP Atomic expects a pointer to target memory");
10413 Type *XElemTy = X.ElemTy;
10414 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10415 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10416 "OMP atomic write expected a scalar type");
10417
10418 if (XElemTy->isIntegerTy()) {
10419 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10420 XSt->setAtomic(AO);
10421 } else if (XElemTy->isStructTy()) {
// The plain load only provides size/alignment for the libcall; it is
// erased below once the atomic store has been emitted.
10422 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10423 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10424 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10425 OpenMPIRBuilder::AtomicInfo atomicInfo(
10426 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10427 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10428 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10429 OldVal->eraseFromParent();
10430 } else {
10431 // We need to bitcast and perform atomic op as integers
10432 IntegerType *IntCastTy =
10433 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10434 Value *ExprCast =
10435 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10436 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10437 XSt->setAtomic(AO);
10438 }
10439
10440 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10441 return Builder.saveIP();
10442}
10443
// NOTE(review): the declaration lines (original source lines 10444–10445)
// were dropped by the extraction; only the trailing parameters are visible.
// From the body: validates the operands (debug builds only), delegates the
// actual update to emitAtomicUpdate, emits any required flush, and returns
// the new insert point (or the error from emitAtomicUpdate).
10446 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10447 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10448 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
// Insert points must be unambiguous before any IR is emitted.
10449 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10450 if (!updateToLocation(Loc))
10451 return Loc.IP;
10452
// Checks only compiled in debug builds; they do not affect codegen.
10453 LLVM_DEBUG({
10454 Type *XTy = X.Var->getType();
10455 assert(XTy->isPointerTy() &&
10456 "OMP Atomic expects a pointer to target memory");
10457 Type *XElemTy = X.ElemTy;
10458 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10459 XElemTy->isPointerTy()) &&
10460 "OMP atomic update expected a scalar type");
10461 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10462 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10463 "OpenMP atomic does not support LT or GT operations");
10464 });
10465
10466 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10467 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10468 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10469 if (!AtomicResult)
10470 return AtomicResult.takeError();
10471 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10472 return Builder.saveIP();
10473}
10474
// Re-materialize the result an atomicrmw would produce, as a plain (non-
// atomic) instruction over Src1/Src2 — used to compute the "new" value for
// postfix captures after the RMW itself was emitted.
// NOTE(review): this doc-rendered listing dropped several `case` label lines
// (original lines 10485, 10491–94, 10497–10508). The Neg(And) expansion at
// 10486 presumably belongs to the dropped `Nand` label, and the dropped
// labels before 10509 fall through to llvm_unreachable together with
// Max/Min; confirm the exact label set against the upstream source.
10475// FIXME: Duplicating AtomicExpand
10476Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10477 AtomicRMWInst::BinOp RMWOp) {
10478 switch (RMWOp) {
10479 case AtomicRMWInst::Add:
10480 return Builder.CreateAdd(Src1, Src2);
10481 case AtomicRMWInst::Sub:
10482 return Builder.CreateSub(Src1, Src2);
10483 case AtomicRMWInst::And:
10484 return Builder.CreateAnd(Src1, Src2);
10486 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10487 case AtomicRMWInst::Or:
10488 return Builder.CreateOr(Src1, Src2);
10489 case AtomicRMWInst::Xor:
10490 return Builder.CreateXor(Src1, Src2);
10495 case AtomicRMWInst::Max:
10496 case AtomicRMWInst::Min:
10509 llvm_unreachable("Unsupported atomic update operation");
10510 }
10511 llvm_unreachable("Unsupported atomic update operation");
10512}
10513
// Emit an atomic update of *X with ordering AO and return {old value, new
// value}. Three strategies, in order of preference:
//   1. a single atomicrmw when RMWOp has a direct instruction form (with
//      AMDGPU memory-behavior metadata applied when targeting AMDGPU);
//   2. a libcall-based compare-exchange loop for struct element types;
//   3. a cmpxchg retry loop over a same-width integer for everything else,
//      invoking UpdateOp to compute the new value each iteration.
// Errors from UpdateOp are propagated to the caller.
// NOTE(review): this doc-rendered listing dropped several hyperlinked lines —
// the `AO`/`RMWOp` parameter line (original line 10516), two `case` labels in
// the switch (10525, 10528), the `Failure` ordering computations (10595–96
// and 10656–57), and the `dyn_cast<UnreachableInst>` conditions of the two
// trailing ifs (10607, 10671). Confirm those against the upstream source.
10514Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10515 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10517 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10518 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10519 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10520 // or a complex datatype.
10521 bool emitRMWOp = false;
10522 switch (RMWOp) {
10523 case AtomicRMWInst::Add:
10524 case AtomicRMWInst::And:
10526 case AtomicRMWInst::Or:
10527 case AtomicRMWInst::Xor:
10529 emitRMWOp = XElemTy;
10530 break;
10531 case AtomicRMWInst::Sub:
// Subtraction is only commutative-safe as atomicrmw when the update has
// the form `x binop expr` (not `expr binop x`).
10532 emitRMWOp = (IsXBinopExpr && XElemTy);
10533 break;
10534 default:
10535 emitRMWOp = false;
10536 }
// atomicrmw fast path is limited to integer element types here.
10537 emitRMWOp &= XElemTy->isIntegerTy();
10538
10539 std::pair<Value *, Value *> Res;
10540 if (emitRMWOp) {
10541 AtomicRMWInst *RMWInst =
10542 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
// AMDGPU-only metadata describing the memory the atomic may touch.
10543 if (T.isAMDGPU()) {
10544 if (IsIgnoreDenormalMode)
10545 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10546 llvm::MDNode::get(Builder.getContext(), {}));
10547 if (!IsFineGrainedMemory)
10548 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10549 llvm::MDNode::get(Builder.getContext(), {}));
10550 if (!IsRemoteMemory)
10551 RMWInst->setMetadata("amdgpu.no.remote.memory",
10552 llvm::MDNode::get(Builder.getContext(), {}));
10553 }
10554 Res.first = RMWInst;
10555 // not needed except in case of postfix captures. Generate anyway for
10556 // consistency with the else part. Will be removed with any DCE pass.
10557 // AtomicRMWInst::Xchg does not have a coressponding instruction.
10558 if (RMWOp == AtomicRMWInst::Xchg)
10559 Res.second = Res.first;
10560 else
10561 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10562 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10563 XElemTy->isStructTy()) {
// Struct path: libcall-based load + compare-exchange retry loop.
10564 LoadInst *OldVal =
10565 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10566 OldVal->setAtomic(AO);
10567 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10568 unsigned LoadSize =
10569 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10570
10571 OpenMPIRBuilder::AtomicInfo atomicInfo(
10572 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10573 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10574 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
// Carve the loop structure out of the current block:
//   CurBB -> ContBB (retry loop) -> ExitBB.
10575 BasicBlock *CurBB = Builder.GetInsertBlock();
10576 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10577 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10578 BasicBlock *ExitBB =
10579 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10580 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10581 X->getName() + ".atomic.cont");
10582 ContBB->getTerminator()->eraseFromParent();
10583 Builder.restoreIP(AllocaIP);
10584 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10585 NewAtomicAddr->setName(X->getName() + "x.new.val");
10586 Builder.SetInsertPoint(ContBB);
10587 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10588 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10589 Value *OldExprVal = PHI;
// The caller-supplied callback computes the updated value; may fail.
10590 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10591 if (!CBResult)
10592 return CBResult.takeError();
10593 Value *Upd = *CBResult;
10594 Builder.CreateStore(Upd, NewAtomicAddr);
10597 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10598 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10599 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10600 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
// Success exits the loop; failure retries with the freshly-read value.
10601 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10602 OldVal->eraseFromParent();
10603 Res.first = OldExprVal;
10604 Res.second = Upd;
10605
// Drop the placeholder unreachable terminator if we created one above.
10606 if (UnreachableInst *ExitTI =
10608 CurBBTI->eraseFromParent();
10609 Builder.SetInsertPoint(ExitBB);
10610 } else {
10611 Builder.SetInsertPoint(ExitTI);
10612 }
10613 } else {
// Generic path: cmpxchg retry loop over a same-width integer.
10614 IntegerType *IntCastTy =
10615 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10616 LoadInst *OldVal =
10617 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10618 OldVal->setAtomic(AO);
10619 // CurBB
10620 // | /---\
10621 // ContBB |
10622 // | \---/
10623 // ExitBB
10624 BasicBlock *CurBB = Builder.GetInsertBlock();
10625 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10626 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10627 BasicBlock *ExitBB =
10628 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10629 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10630 X->getName() + ".atomic.cont");
10631 ContBB->getTerminator()->eraseFromParent();
10632 Builder.restoreIP(AllocaIP);
10633 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10634 NewAtomicAddr->setName(X->getName() + "x.new.val");
10635 Builder.SetInsertPoint(ContBB);
10636 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10637 PHI->addIncoming(OldVal, CurBB);
10638 bool IsIntTy = XElemTy->isIntegerTy();
10639 Value *OldExprVal = PHI;
// Non-integer element types are given to UpdateOp in their real type.
10640 if (!IsIntTy) {
10641 if (XElemTy->isFloatingPointTy()) {
10642 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10643 X->getName() + ".atomic.fltCast");
10644 } else {
10645 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10646 X->getName() + ".atomic.ptrCast");
10647 }
10648 }
10649
10650 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10651 if (!CBResult)
10652 return CBResult.takeError();
10653 Value *Upd = *CBResult;
// Round-trip the updated value through memory to reinterpret it as the
// integer type the cmpxchg operates on.
10654 Builder.CreateStore(Upd, NewAtomicAddr);
10655 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10658 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10659 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10660 Result->setVolatile(VolatileX);
10661 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10662 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10663 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10664 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10665
10666 Res.first = OldExprVal;
10667 Res.second = Upd;
10668
10669 // set Insertion point in exit block
10670 if (UnreachableInst *ExitTI =
10672 CurBBTI->eraseFromParent();
10673 Builder.SetInsertPoint(ExitBB);
10674 } else {
10675 Builder.SetInsertPoint(ExitTI);
10676 }
10677 }
10678
10679 return Res;
10680}
10681
// NOTE(review): the declaration lines (original source lines 10682–10683)
// were dropped by the extraction; only the trailing parameters are visible.
// From the body: emits an atomic capture — performs the atomic update of X
// via emitAtomicUpdate (degrading to an atomic exchange when the new value
// does not depend on the old, i.e. !UpdateExpr), stores either the old value
// (postfix) or the new value (prefix) into V, emits any required flush, and
// returns the insert point (or the error from emitAtomicUpdate).
10684 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10685 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10686 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10687 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10688 if (!updateToLocation(Loc))
10689 return Loc.IP;
10690
// Checks only compiled in debug builds; they do not affect codegen.
10691 LLVM_DEBUG({
10692 Type *XTy = X.Var->getType();
10693 assert(XTy->isPointerTy() &&
10694 "OMP Atomic expects a pointer to target memory");
10695 Type *XElemTy = X.ElemTy;
10696 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10697 XElemTy->isPointerTy()) &&
10698 "OMP atomic capture expected a scalar type");
10699 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10700 "OpenMP atomic does not support LT or GT operations");
10701 });
10702
10703 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10704 // 'x' is simply atomically rewritten with 'expr'.
10705 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10706 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10707 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10708 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10709 if (!AtomicResult)
10710 return AtomicResult.takeError();
// Postfix capture (`v = x++` style) wants the old value; prefix wants new.
10711 Value *CapturedVal =
10712 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10713 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10714
10715 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10716 return Builder.saveIP();
10717}
10718
// NOTE(review): the declaration lines (original source lines 10719–10721) and
// line 10725 — which computed the `Failure` ordering forwarded below,
// presumably derived from AO — were dropped by the extraction; confirm
// against the upstream source. This is a convenience overload that forwards
// to the full createAtomicCompare with an explicit failure ordering.
10722 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10723 bool IsFailOnly) {
10724
10726 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10727 IsPostfixUpdate, IsFailOnly, Failure);
10728}
10729
10733 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10734 bool IsFailOnly, AtomicOrdering Failure) {
10735
10736 if (!updateToLocation(Loc))
10737 return Loc.IP;
10738
10739 assert(X.Var->getType()->isPointerTy() &&
10740 "OMP atomic expects a pointer to target memory");
10741 // compare capture
10742 if (V.Var) {
10743 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10744 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10745 }
10746
10747 bool IsInteger = E->getType()->isIntegerTy();
10748
10749 if (Op == OMPAtomicCompareOp::EQ) {
10750 AtomicCmpXchgInst *Result = nullptr;
10751 if (!IsInteger) {
10752 IntegerType *IntCastTy =
10753 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10754 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10755 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10756 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10757 AO, Failure);
10758 } else {
10759 Result =
10760 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10761 }
10762
10763 if (V.Var) {
10764 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10765 if (!IsInteger)
10766 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10767 assert(OldValue->getType() == V.ElemTy &&
10768 "OldValue and V must be of same type");
10769 if (IsPostfixUpdate) {
10770 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10771 } else {
10772 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10773 if (IsFailOnly) {
10774 // CurBB----
10775 // | |
10776 // v |
10777 // ContBB |
10778 // | |
10779 // v |
10780 // ExitBB <-
10781 //
10782 // where ContBB only contains the store of old value to 'v'.
10783 BasicBlock *CurBB = Builder.GetInsertBlock();
10784 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10785 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10786 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10787 CurBBTI, X.Var->getName() + ".atomic.exit");
10788 BasicBlock *ContBB = CurBB->splitBasicBlock(
10789 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10790 ContBB->getTerminator()->eraseFromParent();
10791 CurBB->getTerminator()->eraseFromParent();
10792
10793 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10794
10795 Builder.SetInsertPoint(ContBB);
10796 Builder.CreateStore(OldValue, V.Var);
10797 Builder.CreateBr(ExitBB);
10798
10799 if (UnreachableInst *ExitTI =
10801 CurBBTI->eraseFromParent();
10802 Builder.SetInsertPoint(ExitBB);
10803 } else {
10804 Builder.SetInsertPoint(ExitTI);
10805 }
10806 } else {
10807 Value *CapturedValue =
10808 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10809 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10810 }
10811 }
10812 }
10813 // The comparison result has to be stored.
10814 if (R.Var) {
10815 assert(R.Var->getType()->isPointerTy() &&
10816 "r.var must be of pointer type");
10817 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10818
10819 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10820 Value *ResultCast = R.IsSigned
10821 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10822 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10823 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10824 }
10825 } else {
10826 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10827 "Op should be either max or min at this point");
10828 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10829
10830 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10831 // Let's take max as example.
10832 // OpenMP form:
10833 // x = x > expr ? expr : x;
10834 // LLVM form:
10835 // *ptr = *ptr > val ? *ptr : val;
10836 // We need to transform to LLVM form.
10837 // x = x <= expr ? x : expr;
10839 if (IsXBinopExpr) {
10840 if (IsInteger) {
10841 if (X.IsSigned)
10842 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10844 else
10845 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10847 } else {
10848 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10850 }
10851 } else {
10852 if (IsInteger) {
10853 if (X.IsSigned)
10854 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10856 else
10857 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10859 } else {
10860 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10862 }
10863 }
10864
10865 AtomicRMWInst *OldValue =
10866 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10867 if (V.Var) {
10868 Value *CapturedValue = nullptr;
10869 if (IsPostfixUpdate) {
10870 CapturedValue = OldValue;
10871 } else {
10872 CmpInst::Predicate Pred;
10873 switch (NewOp) {
10874 case AtomicRMWInst::Max:
10875 Pred = CmpInst::ICMP_SGT;
10876 break;
10878 Pred = CmpInst::ICMP_UGT;
10879 break;
10881 Pred = CmpInst::FCMP_OGT;
10882 break;
10883 case AtomicRMWInst::Min:
10884 Pred = CmpInst::ICMP_SLT;
10885 break;
10887 Pred = CmpInst::ICMP_ULT;
10888 break;
10890 Pred = CmpInst::FCMP_OLT;
10891 break;
10892 default:
10893 llvm_unreachable("unexpected comparison op");
10894 }
10895 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10896 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10897 }
10898 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10899 }
10900 }
10901
10902 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10903
10904 return Builder.saveIP();
10905}
10906
10909 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10910 Value *NumTeamsUpper, Value *ThreadLimit,
10911 Value *IfExpr) {
10912 if (!updateToLocation(Loc))
10913 return InsertPointTy();
10914
10915 uint32_t SrcLocStrSize;
10916 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10917 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10918 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10919
10920 // Outer allocation basicblock is the entry block of the current function.
10921 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10922 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10923 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10924 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10925 }
10926
10927 // The current basic block is split into four basic blocks. After outlining,
10928 // they will be mapped as follows:
10929 // ```
10930 // def current_fn() {
10931 // current_basic_block:
10932 // br label %teams.exit
10933 // teams.exit:
10934 // ; instructions after teams
10935 // }
10936 //
10937 // def outlined_fn() {
10938 // teams.alloca:
10939 // br label %teams.body
10940 // teams.body:
10941 // ; instructions within teams body
10942 // }
10943 // ```
10944 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10945 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10946 BasicBlock *AllocaBB =
10947 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10948
10949 bool SubClausesPresent =
10950 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10951 // Push num_teams
10952 if (!Config.isTargetDevice() && SubClausesPresent) {
10953 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10954 "if lowerbound is non-null, then upperbound must also be non-null "
10955 "for bounds on num_teams");
10956
10957 if (NumTeamsUpper == nullptr)
10958 NumTeamsUpper = Builder.getInt32(0);
10959
10960 if (NumTeamsLower == nullptr)
10961 NumTeamsLower = NumTeamsUpper;
10962
10963 if (IfExpr) {
10964 assert(IfExpr->getType()->isIntegerTy() &&
10965 "argument to if clause must be an integer value");
10966
10967 // upper = ifexpr ? upper : 1
10968 if (IfExpr->getType() != Int1)
10969 IfExpr = Builder.CreateICmpNE(IfExpr,
10970 ConstantInt::get(IfExpr->getType(), 0));
10971 NumTeamsUpper = Builder.CreateSelect(
10972 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10973
10974 // lower = ifexpr ? lower : 1
10975 NumTeamsLower = Builder.CreateSelect(
10976 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10977 }
10978
10979 if (ThreadLimit == nullptr)
10980 ThreadLimit = Builder.getInt32(0);
10981
10982 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
10983 // truncate or sign extend the passed values to match the int32 parameters.
10984 Value *NumTeamsLowerInt32 =
10985 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
10986 Value *NumTeamsUpperInt32 =
10987 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
10988 Value *ThreadLimitInt32 =
10989 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
10990
10991 Value *ThreadNum = getOrCreateThreadID(Ident);
10992
10994 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10995 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
10996 ThreadLimitInt32});
10997 }
10998 // Generate the body of teams.
10999 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11000 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11001 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11002 return Err;
11003
11004 OutlineInfo OI;
11005 OI.EntryBB = AllocaBB;
11006 OI.ExitBB = ExitBB;
11007 OI.OuterAllocaBB = &OuterAllocaBB;
11008
11009 // Insert fake values for global tid and bound tid.
11011 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11013 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11015 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11016
11017 auto HostPostOutlineCB = [this, Ident,
11018 ToBeDeleted](Function &OutlinedFn) mutable {
11019 // The stale call instruction will be replaced with a new call instruction
11020 // for runtime call with the outlined function.
11021
11022 assert(OutlinedFn.hasOneUse() &&
11023 "there must be a single user for the outlined function");
11024 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11025 ToBeDeleted.push_back(StaleCI);
11026
11027 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11028 "Outlined function must have two or three arguments only");
11029
11030 bool HasShared = OutlinedFn.arg_size() == 3;
11031
11032 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11033 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11034 if (HasShared)
11035 OutlinedFn.getArg(2)->setName("data");
11036
11037 // Call to the runtime function for teams in the current function.
11038 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11039 "outlined function.");
11040 Builder.SetInsertPoint(StaleCI);
11041 SmallVector<Value *> Args = {
11042 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11043 if (HasShared)
11044 Args.push_back(StaleCI->getArgOperand(2));
11047 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11048 Args);
11049
11050 for (Instruction *I : llvm::reverse(ToBeDeleted))
11051 I->eraseFromParent();
11052 };
11053
11054 if (!Config.isTargetDevice())
11055 OI.PostOutlineCB = HostPostOutlineCB;
11056
11057 addOutlineInfo(std::move(OI));
11058
11059 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11060
11061 return Builder.saveIP();
11062}
11063
11066 InsertPointTy OuterAllocaIP,
11067 BodyGenCallbackTy BodyGenCB) {
11068 if (!updateToLocation(Loc))
11069 return InsertPointTy();
11070
11071 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
11072
11073 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11074 BasicBlock *BodyBB =
11075 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11076 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11077 }
11078 BasicBlock *ExitBB =
11079 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11080 BasicBlock *BodyBB =
11081 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11082 BasicBlock *AllocaBB =
11083 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11084
11085 // Generate the body of distribute clause
11086 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11087 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11088 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11089 return Err;
11090
11091 // When using target we use different runtime functions which require a
11092 // callback.
11093 if (Config.isTargetDevice()) {
11094 OutlineInfo OI;
11095 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
11096 OI.EntryBB = AllocaBB;
11097 OI.ExitBB = ExitBB;
11098
11099 addOutlineInfo(std::move(OI));
11100 }
11101 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11102
11103 return Builder.saveIP();
11104}
11105
11108 std::string VarName) {
11109 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11111 Names.size()),
11112 Names);
11113 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11114 M, MapNamesArrayInit->getType(),
11115 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11116 VarName);
11117 return MapNamesArrayGlobal;
11118}
11119
// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  // Scratch variable reused by each OMP_STRUCT_TYPE expansion below.
  StructType *T;
  // Data pointers use the configured default target address space, while
  // function pointers use the module's program address space; the macro
  // expansions below pick the appropriate one for each pointer member.
  unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
  // Each OMP_* macro is expanded once per corresponding entry in
  // OMPKinds.def (textually included at the bottom), initializing the
  // builder's type members via token pasting (VarName##Ty, VarName##Ptr...).
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
  VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
  VarName##Ptr = PointerType::get(Ctx, ProgramAS);
// Struct types are looked up by name first so that a type already created
// (e.g. by the frontend) is reused rather than duplicated.
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
  T = StructType::getTypeByName(Ctx, StructName); \
  if (!T) \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
  VarName = T; \
  VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
11142
11145 SmallVectorImpl<BasicBlock *> &BlockVector) {
11147 BlockSet.insert(EntryBB);
11148 BlockSet.insert(ExitBB);
11149
11150 Worklist.push_back(EntryBB);
11151 while (!Worklist.empty()) {
11152 BasicBlock *BB = Worklist.pop_back_val();
11153 BlockVector.push_back(BB);
11154 for (BasicBlock *SuccBB : successors(BB))
11155 if (BlockSet.insert(SuccBB).second)
11156 Worklist.push_back(SuccBB);
11157 }
11158}
11159
11161 uint64_t Size, int32_t Flags,
11163 StringRef Name) {
11164 if (!Config.isGPU()) {
11167 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11168 return;
11169 }
11170 // TODO: Add support for global variables on the device after declare target
11171 // support.
11172 Function *Fn = dyn_cast<Function>(Addr);
11173 if (!Fn)
11174 return;
11175
11176 // Add a function attribute for the kernel.
11177 Fn->addFnAttr("kernel");
11178 if (T.isAMDGCN())
11179 Fn->addFnAttr("uniform-work-group-size");
11180 Fn->addFnAttr(Attribute::MustProgress);
11181}
11182
11183// We only generate metadata for function that contain target regions.
11186
11187 // If there are no entries, we don't need to do anything.
11188 if (OffloadInfoManager.empty())
11189 return;
11190
11191 LLVMContext &C = M.getContext();
11194 16>
11195 OrderedEntries(OffloadInfoManager.size());
11196
11197 // Auxiliary methods to create metadata values and strings.
11198 auto &&GetMDInt = [this](unsigned V) {
11199 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11200 };
11201
11202 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11203
11204 // Create the offloading info metadata node.
11205 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11206 auto &&TargetRegionMetadataEmitter =
11207 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11208 const TargetRegionEntryInfo &EntryInfo,
11210 // Generate metadata for target regions. Each entry of this metadata
11211 // contains:
11212 // - Entry 0 -> Kind of this type of metadata (0).
11213 // - Entry 1 -> Device ID of the file where the entry was identified.
11214 // - Entry 2 -> File ID of the file where the entry was identified.
11215 // - Entry 3 -> Mangled name of the function where the entry was
11216 // identified.
11217 // - Entry 4 -> Line in the file where the entry was identified.
11218 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11219 // - Entry 6 -> Order the entry was created.
11220 // The first element of the metadata node is the kind.
11221 Metadata *Ops[] = {
11222 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11223 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11224 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11225 GetMDInt(E.getOrder())};
11226
11227 // Save this entry in the right position of the ordered entries array.
11228 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11229
11230 // Add metadata to the named metadata node.
11231 MD->addOperand(MDNode::get(C, Ops));
11232 };
11233
11234 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11235
11236 // Create function that emits metadata for each device global variable entry;
11237 auto &&DeviceGlobalVarMetadataEmitter =
11238 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11239 StringRef MangledName,
11241 // Generate metadata for global variables. Each entry of this metadata
11242 // contains:
11243 // - Entry 0 -> Kind of this type of metadata (1).
11244 // - Entry 1 -> Mangled name of the variable.
11245 // - Entry 2 -> Declare target kind.
11246 // - Entry 3 -> Order the entry was created.
11247 // The first element of the metadata node is the kind.
11248 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11249 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11250
11251 // Save this entry in the right position of the ordered entries array.
11252 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11253 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11254
11255 // Add metadata to the named metadata node.
11256 MD->addOperand(MDNode::get(C, Ops));
11257 };
11258
11259 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11260 DeviceGlobalVarMetadataEmitter);
11261
11262 for (const auto &E : OrderedEntries) {
11263 assert(E.first && "All ordered entries must exist!");
11264 if (const auto *CE =
11266 E.first)) {
11267 if (!CE->getID() || !CE->getAddress()) {
11268        // Do not blame the entry if the parent function is not emitted.
11269 TargetRegionEntryInfo EntryInfo = E.second;
11270 StringRef FnName = EntryInfo.ParentName;
11271 if (!M.getNamedValue(FnName))
11272 continue;
11273 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11274 continue;
11275 }
11276 createOffloadEntry(CE->getID(), CE->getAddress(),
11277 /*Size=*/0, CE->getFlags(),
11279 } else if (const auto *CE = dyn_cast<
11281 E.first)) {
11284 CE->getFlags());
11285 switch (Flags) {
11288 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11289 continue;
11290 if (!CE->getAddress()) {
11291 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11292 continue;
11293 }
11294      // The variable has no definition - no need to add the entry.
11295 if (CE->getVarSize() == 0)
11296 continue;
11297 break;
11299 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11300 (!Config.isTargetDevice() && CE->getAddress())) &&
11301 "Declaret target link address is set.");
11302 if (Config.isTargetDevice())
11303 continue;
11304 if (!CE->getAddress()) {
11306 continue;
11307 }
11308 break;
11311 if (!CE->getAddress()) {
11312 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11313 continue;
11314 }
11315 break;
11316 default:
11317 break;
11318 }
11319
11320 // Hidden or internal symbols on the device are not externally visible.
11321 // We should not attempt to register them by creating an offloading
11322 // entry. Indirect variables are handled separately on the device.
11323 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11324 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11325 (Flags !=
11327 Flags != OffloadEntriesInfoManager::
11328 OMPTargetGlobalVarEntryIndirectVTable))
11329 continue;
11330
11331 // Indirect globals need to use a special name that doesn't match the name
11332 // of the associated host global.
11334 Flags ==
11336 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11337 Flags, CE->getLinkage(), CE->getVarName());
11338 else
11339 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11340 Flags, CE->getLinkage());
11341
11342 } else {
11343 llvm_unreachable("Unsupported entry kind.");
11344 }
11345 }
11346
11347 // Emit requires directive globals to a special entry so the runtime can
11348 // register them when the device image is loaded.
11349 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11350 // entries should be redesigned to better suit this use-case.
11351 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11355 ".requires", /*Size=*/0,
11357 Config.getRequiresFlags());
11358}
11359
11362 unsigned FileID, unsigned Line, unsigned Count) {
11363 raw_svector_ostream OS(Name);
11364 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11365 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11366 if (Count)
11367 OS << "_" << Count;
11368}
11369
11371 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11372 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11374 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11375 EntryInfo.Line, NewCount);
11376}
11377
11380 vfs::FileSystem &VFS,
11381 StringRef ParentName) {
11382 sys::fs::UniqueID ID(0xdeadf17e, 0);
11383 auto FileIDInfo = CallBack();
11384 uint64_t FileID = 0;
11385 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11386 ID = Status->getUniqueID();
11387 FileID = Status->getUniqueID().getFile();
11388 } else {
11389    // If the inode ID could not be determined, create a hash value for
11390    // the current file name and use that as an ID.
11391 FileID = hash_value(std::get<0>(FileIDInfo));
11392 }
11393
11394 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11395 std::get<1>(FileIDInfo));
11396}
11397
11399 unsigned Offset = 0;
11400 for (uint64_t Remain =
11401 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11403 !(Remain & 1); Remain = Remain >> 1)
11404 Offset++;
11405 return Offset;
11406}
11407
11410 // Rotate by getFlagMemberOffset() bits.
11411 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11412 << getFlagMemberOffset());
11413}
11414
11417 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11418 // If the entry is PTR_AND_OBJ but has not been marked with the special
11419 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11420 // marked as MEMBER_OF.
11421 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11423 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11426 return;
11427
11428 // Entries with ATTACH are not members-of anything. They are handled
11429 // separately by the runtime after other maps have been handled.
11430 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11432 return;
11433
11434 // Reset the placeholder value to prepare the flag for the assignment of the
11435 // proper MEMBER_OF value.
11436 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11437 Flags |= MemberOfFlag;
11438}
11439
11443 bool IsDeclaration, bool IsExternallyVisible,
11444 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11445 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11446 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11447 std::function<Constant *()> GlobalInitializer,
11448 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11449 // TODO: convert this to utilise the IRBuilder Config rather than
11450 // a passed down argument.
11451 if (OpenMPSIMD)
11452 return nullptr;
11453
11456 CaptureClause ==
11458 Config.hasRequiresUnifiedSharedMemory())) {
11459 SmallString<64> PtrName;
11460 {
11461 raw_svector_ostream OS(PtrName);
11462 OS << MangledName;
11463 if (!IsExternallyVisible)
11464 OS << format("_%x", EntryInfo.FileID);
11465 OS << "_decl_tgt_ref_ptr";
11466 }
11467
11468 Value *Ptr = M.getNamedValue(PtrName);
11469
11470 if (!Ptr) {
11471 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11472 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11473
11474 auto *GV = cast<GlobalVariable>(Ptr);
11475 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11476
11477 if (!Config.isTargetDevice()) {
11478 if (GlobalInitializer)
11479 GV->setInitializer(GlobalInitializer());
11480 else
11481 GV->setInitializer(GlobalValue);
11482 }
11483
11485 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11486 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11487 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11488 }
11489
11490 return cast<Constant>(Ptr);
11491 }
11492
11493 return nullptr;
11494}
11495
11499 bool IsDeclaration, bool IsExternallyVisible,
11500 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11501 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11502 std::vector<Triple> TargetTriple,
11503 std::function<Constant *()> GlobalInitializer,
11504 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11505 Constant *Addr) {
11507 (TargetTriple.empty() && !Config.isTargetDevice()))
11508 return;
11509
11511 StringRef VarName;
11512 int64_t VarSize;
11514
11516 CaptureClause ==
11518 !Config.hasRequiresUnifiedSharedMemory()) {
11520 VarName = MangledName;
11521 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11522
11523 if (!IsDeclaration)
11524 VarSize = divideCeil(
11525 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11526 else
11527 VarSize = 0;
11528 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11529
11530 // This is a workaround carried over from Clang which prevents undesired
11531 // optimisation of internal variables.
11532 if (Config.isTargetDevice() &&
11533 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11534 // Do not create a "ref-variable" if the original is not also available
11535 // on the host.
11536 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11537 return;
11538
11539 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11540
11541 if (!M.getNamedValue(RefName)) {
11542 Constant *AddrRef =
11543 getOrCreateInternalVariable(Addr->getType(), RefName);
11544 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11545 GvAddrRef->setConstant(true);
11546 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11547 GvAddrRef->setInitializer(Addr);
11548 GeneratedRefs.push_back(GvAddrRef);
11549 }
11550 }
11551 } else {
11554 else
11556
11557 if (Config.isTargetDevice()) {
11558 VarName = (Addr) ? Addr->getName() : "";
11559 Addr = nullptr;
11560 } else {
11562 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11563 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11564 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11565 VarName = (Addr) ? Addr->getName() : "";
11566 }
11567 VarSize = M.getDataLayout().getPointerSize();
11569 }
11570
11571 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11572 Flags, Linkage);
11573}
11574
11575/// Loads all the offload entries information from the host IR
11576/// metadata.
11578 // If we are in target mode, load the metadata from the host IR. This code has
11579 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11580
11581 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11582 if (!MD)
11583 return;
11584
11585 for (MDNode *MN : MD->operands()) {
11586 auto &&GetMDInt = [MN](unsigned Idx) {
11587 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11588 return cast<ConstantInt>(V->getValue())->getZExtValue();
11589 };
11590
11591 auto &&GetMDString = [MN](unsigned Idx) {
11592 auto *V = cast<MDString>(MN->getOperand(Idx));
11593 return V->getString();
11594 };
11595
11596 switch (GetMDInt(0)) {
11597 default:
11598 llvm_unreachable("Unexpected metadata!");
11599 break;
11600 case OffloadEntriesInfoManager::OffloadEntryInfo::
11601 OffloadingEntryInfoTargetRegion: {
11602 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11603 /*DeviceID=*/GetMDInt(1),
11604 /*FileID=*/GetMDInt(2),
11605 /*Line=*/GetMDInt(4),
11606 /*Count=*/GetMDInt(5));
11607 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11608 /*Order=*/GetMDInt(6));
11609 break;
11610 }
11611 case OffloadEntriesInfoManager::OffloadEntryInfo::
11612 OffloadingEntryInfoDeviceGlobalVar:
11613 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11614 /*MangledName=*/GetMDString(1),
11616 /*Flags=*/GetMDInt(2)),
11617 /*Order=*/GetMDInt(3));
11618 break;
11619 }
11620 }
11621}
11622
11624 StringRef HostFilePath) {
11625 if (HostFilePath.empty())
11626 return;
11627
11628 auto Buf = VFS.getBufferForFile(HostFilePath);
11629 if (std::error_code Err = Buf.getError()) {
11630 report_fatal_error(("error opening host file from host file path inside of "
11631 "OpenMPIRBuilder: " +
11632 Err.message())
11633 .c_str());
11634 }
11635
11636 LLVMContext Ctx;
11638 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11639 if (std::error_code Err = M.getError()) {
11641 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11642 .c_str());
11643 }
11644
11645 loadOffloadInfoMetadata(*M.get());
11646}
11647
11650 llvm::StringRef Name) {
11651 Builder.restoreIP(Loc.IP);
11652
11653 BasicBlock *CurBB = Builder.GetInsertBlock();
11654 assert(CurBB &&
11655 "expected a valid insertion block for creating an iterator loop");
11656 Function *F = CurBB->getParent();
11657
11658 InsertPointTy SplitIP = Builder.saveIP();
11659 if (SplitIP.getPoint() == CurBB->end())
11660 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
11661 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
11662
11663 BasicBlock *ContBB =
11664 splitBB(SplitIP, /*CreateBranch=*/false,
11665 Builder.getCurrentDebugLocation(), "omp.it.cont");
11666
11667 CanonicalLoopInfo *CLI =
11668 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
11669 /*PreInsertBefore=*/ContBB,
11670 /*PostInsertBefore=*/ContBB, Name);
11671
11672 // Enter loop from original block.
11673 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
11674
11675 // Remove the unconditional branch inserted by createLoopSkeleton in the body
11676 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
11677 T->eraseFromParent();
11678
11679 InsertPointTy BodyIP = CLI->getBodyIP();
11680 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
11681 return Err;
11682
11683 // Body must either fallthrough to the latch or branch directly to it.
11684 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
11685 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
11686 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
11688 "iterator bodygen must terminate the canonical body with an "
11689 "unconditional branch to the loop latch",
11691 }
11692 } else {
11693 // Ensure we end the loop body by jumping to the latch.
11694 Builder.SetInsertPoint(CLI->getBody());
11695 Builder.CreateBr(CLI->getLatch());
11696 }
11697
11698 // Link After -> ContBB
11699 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
11700 if (!CLI->getAfter()->hasTerminator())
11701 Builder.CreateBr(ContBB);
11702
11703 return InsertPointTy{ContBB, ContBB->begin()};
11704}
11705
11706/// Mangle the parameter part of the vector function name according to
11707/// their OpenMP classification. The mangling function is defined in
11708/// section 4.5 of the AAVFABI(2021Q1).
11709static std::string mangleVectorParameters(
11711 SmallString<256> Buffer;
11712 llvm::raw_svector_ostream Out(Buffer);
11713 for (const auto &ParamAttr : ParamAttrs) {
11714 switch (ParamAttr.Kind) {
11716 Out << 'l';
11717 break;
11719 Out << 'R';
11720 break;
11722 Out << 'U';
11723 break;
11725 Out << 'L';
11726 break;
11728 Out << 'u';
11729 break;
11731 Out << 'v';
11732 break;
11733 }
11734 if (ParamAttr.HasVarStride)
11735 Out << "s" << ParamAttr.StrideOrArg;
11736 else if (ParamAttr.Kind ==
11738 ParamAttr.Kind ==
11740 ParamAttr.Kind ==
11742 ParamAttr.Kind ==
11744 // Don't print the step value if it is not present or if it is
11745 // equal to 1.
11746 if (ParamAttr.StrideOrArg < 0)
11747 Out << 'n' << -ParamAttr.StrideOrArg;
11748 else if (ParamAttr.StrideOrArg != 1)
11749 Out << ParamAttr.StrideOrArg;
11750 }
11751
11752 if (!!ParamAttr.Alignment)
11753 Out << 'a' << ParamAttr.Alignment;
11754 }
11755
11756 return std::string(Out.str());
11757}
11758
// Emits x86 "declare simd" vector-variant attributes ("_ZGV<isa><mask>...")
// for Fn, one per (mask, ISA) combination; the vector length is either the
// user-provided VLENVal or derived as VecRegSize / NumElts.
// NOTE(review): garbled doxygen extraction -- leading integers are upstream
// line numbers; the signature line (11759), the `Masked` container
// declaration (11772), the `case` labels of the Branch switch, and the
// `Buffer` declaration (11787) were dropped. Code kept byte-identical.
// Function name inferred from upstream LLVM sources -- confirm.
11760 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
11762 struct ISADataTy {
11763 char ISA;
11764 unsigned VecRegSize;
11765 };
11766 ISADataTy ISAData[] = {
11767 {'b', 128}, // SSE
11768 {'c', 256}, // AVX
11769 {'d', 256}, // AVX2
11770 {'e', 512}, // AVX512
11771 };
11773 switch (Branch) {
11775 Masked.push_back('N');
11776 Masked.push_back('M');
11777 break;
11779 Masked.push_back('N');
11780 break;
11782 Masked.push_back('M');
11783 break;
11784 }
11785 for (char Mask : Masked) {
11786 for (const ISADataTy &Data : ISAData) {
11788 llvm::raw_svector_ostream Out(Buffer);
11789 Out << "_ZGV" << Data.ISA << Mask;
11790 if (!VLENVal) {
11791 assert(NumElts && "Non-zero simdlen/cdtsize expected");
11792 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
11793 } else {
11794 Out << VLENVal;
11795 }
11796 Out << mangleVectorParameters(ParamAttrs);
11797 Out << '_' << Fn->getName();
11798 Fn->addFnAttr(Out.str());
11799 }
11800 }
11801 }
11802
11803// Function used to add the attribute. The parameter `VLEN` is templated to
11804// allow the use of `x` when targeting scalable functions for SVE.
11805template <typename T>
11806static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
11807 char ISA, StringRef ParSeq,
11808 StringRef MangledName, bool OutputBecomesInput,
11809 llvm::Function *Fn) {
11810 SmallString<256> Buffer;
11811 llvm::raw_svector_ostream Out(Buffer);
11812 Out << Prefix << ISA << LMask << VLEN;
11813 if (OutputBecomesInput)
11814 Out << 'v';
11815 Out << ParSeq << '_' << MangledName;
11816 Fn->addFnAttr(Out.str());
11817}
11818
11819// Helper function to generate the Advanced SIMD names depending on the value
11820// of the NDS when simdlen is not present.
11821static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
11822 StringRef Prefix, char ISA,
11823 StringRef ParSeq, StringRef MangledName,
11824 bool OutputBecomesInput,
11825 llvm::Function *Fn) {
11826 switch (NDS) {
11827 case 8:
11828 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11829 OutputBecomesInput, Fn);
11830 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
11831 OutputBecomesInput, Fn);
11832 break;
11833 case 16:
11834 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11835 OutputBecomesInput, Fn);
11836 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11837 OutputBecomesInput, Fn);
11838 break;
11839 case 32:
11840 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11841 OutputBecomesInput, Fn);
11842 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11843 OutputBecomesInput, Fn);
11844 break;
11845 case 64:
11846 case 128:
11847 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11848 OutputBecomesInput, Fn);
11849 break;
11850 default:
11851 llvm_unreachable("Scalar type is too wide.");
11852 }
11853}
11854
// NOTE(review): garbled doxygen extraction -- leading integers are upstream
// line numbers; the signature lines (11856, 11858) and the `case` enum-label
// lines of both Branch switches were dropped. Code kept byte-identical.
11855/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
11857 llvm::Function *Fn, unsigned UserVLEN,
11859 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
11860 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
11861
11862 // Sort out parameter sequence.
11863 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
11864 StringRef Prefix = "_ZGV";
11865 StringRef MangledName = Fn->getName();
11866
11867 // Generate simdlen from user input (if any).
11868 if (UserVLEN) {
11869 if (ISA == 's') {
11870 // SVE generates only a masked function.
11871 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11872 OutputBecomesInput, Fn);
11873 return;
11874 }
11875
// With an explicit simdlen, Advanced SIMD emits masked/unmasked variants
// according to the (dropped) branch-prediction clause cases below.
11876 switch (Branch) {
11878 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11879 OutputBecomesInput, Fn);
11880 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11881 OutputBecomesInput, Fn);
11882 break;
11884 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11885 OutputBecomesInput, Fn);
11886 break;
11888 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11889 OutputBecomesInput, Fn);
11890 break;
11891 }
11892 return;
11893 }
11894
11895 if (ISA == 's') {
11896 // SVE, section 3.4.1, item 1.
11897 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
11898 OutputBecomesInput, Fn);
11899 return;
11900 }
11901
// No simdlen: derive vector lengths from the narrowest data size.
11902 switch (Branch) {
11904 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11905 MangledName, OutputBecomesInput, Fn);
11906 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11907 MangledName, OutputBecomesInput, Fn);
11908 break;
11910 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11911 MangledName, OutputBecomesInput, Fn);
11912 break;
11914 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11915 MangledName, OutputBecomesInput, Fn);
11916 break;
11917 }
11918}
11919
11920//===----------------------------------------------------------------------===//
11921// OffloadEntriesInfoManager
11922//===----------------------------------------------------------------------===//
11923
// True when no target-region and no device-global-var entries are recorded.
// NOTE(review): the declaration line (upstream 11924, presumably
// OffloadEntriesInfoManager::empty) was dropped by the extraction; leading
// integers are upstream line numbers. Code kept byte-identical.
11925 return OffloadEntriesTargetRegion.empty() &&
11926 OffloadEntriesDeviceGlobalVar.empty();
11927 }
11928
11929unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11930 const TargetRegionEntryInfo &EntryInfo) const {
11931 auto It = OffloadEntriesTargetRegionCount.find(
11932 getTargetRegionEntryCountKey(EntryInfo));
11933 if (It == OffloadEntriesTargetRegionCount.end())
11934 return 0;
11935 return It->second;
11936}
11937
11938void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11939 const TargetRegionEntryInfo &EntryInfo) {
11940 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11941 EntryInfo.Count + 1;
11942}
11943
// NOTE(review): garbled doxygen extraction -- the signature line (11945) and
// the flags argument line (11949) were dropped; leading integers are upstream
// line numbers. Code kept byte-identical. Creates a placeholder entry (null
// Addr/ID) at position Order and bumps the global entry counter.
11944/// Initialize target region entry.
11946 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11947 OffloadEntriesTargetRegion[EntryInfo] =
11948 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11950 ++OffloadingEntriesNum;
11951 }
11952
// Registers Addr/ID/Flags for a target region. On device compilations the
// pre-initialized entry is updated in place; on host compilations a fresh
// entry is created. Either way the per-location count is advanced at the end.
// NOTE(review): garbled doxygen extraction -- the signature lines (11953,
// 11955) and a condition line inside the else branch (11973) were dropped;
// leading integers are upstream line numbers. Code kept byte-identical.
11954 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11956 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11957
11958 // Update the EntryInfo with the next available count for this location.
11959 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11960
11961 // If we are emitting code for a target, the entry is already initialized,
11962 // only has to be registered.
11963 if (OMPBuilder->Config.isTargetDevice()) {
11964 // This could happen if the device compilation is invoked standalone.
11965 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11966 return;
11967 }
11968 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11969 Entry.setAddress(Addr);
11970 Entry.setID(ID);
11971 Entry.setFlags(Flags);
11972 } else {
11974 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11975 return;
11976 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11977 "Target region entry already registered!");
11978 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11979 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11980 ++OffloadingEntriesNum;
11981 }
11982 incrementTargetRegionEntryInfoCount(EntryInfo);
11983 }
11984
// True if an entry exists for this location (with its current count) that is
// still unregistered -- unless IgnoreAddressId, a populated Addr/ID makes the
// lookup fail so double registration is rejected.
// NOTE(review): garbled doxygen extraction -- the signature line (11985) was
// dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
11986 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11987
11988 // Update the EntryInfo with the next available count for this location.
11989 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11990
11991 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11992 if (It == OffloadEntriesTargetRegion.end()) {
11993 return false;
11994 }
11995 // Fail if this entry is already registered.
11996 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11997 return false;
11998 return true;
11999 }
12000
// Invokes Action on every recorded (EntryInfo, Entry) pair.
// NOTE(review): garbled doxygen extraction -- the signature line (12001) was
// dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
12002 const OffloadTargetRegionEntryInfoActTy &Action) {
12003 // Scan all target region entries and perform the provided action.
12004 for (const auto &It : OffloadEntriesTargetRegion) {
12005 Action(It.first, It.second);
12006 }
12007 }
12008
// Creates a placeholder device-global-var entry keyed by Name and bumps the
// global entry counter.
// NOTE(review): garbled doxygen extraction -- the signature line (12009) was
// dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
12010 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12011 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12012 ++OffloadingEntriesNum;
12013 }
12014
// Registers address/size/linkage for a device global variable: updates the
// pre-initialized entry on device compilations, creates a new entry on host
// compilations (only filling in size/linkage when an entry already exists).
// NOTE(review): garbled doxygen extraction -- the signature lines (12015,
// 12017) and the flag-comparison lines of the trailing if (12044, 12046)
// were dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
12016 StringRef VarName, Constant *Addr, int64_t VarSize,
12018 if (OMPBuilder->Config.isTargetDevice()) {
12019 // This could happen if the device compilation is invoked standalone.
12020 if (!hasDeviceGlobalVarEntryInfo(VarName))
12021 return;
12022 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12023 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12024 if (Entry.getVarSize() == 0) {
12025 Entry.setVarSize(VarSize);
12026 Entry.setLinkage(Linkage);
12027 }
12028 return;
12029 }
12030 Entry.setVarSize(VarSize);
12031 Entry.setLinkage(Linkage);
12032 Entry.setAddress(Addr);
12033 } else {
12034 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12035 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12036 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12037 "Entry not initialized!");
12038 if (Entry.getVarSize() == 0) {
12039 Entry.setVarSize(VarSize);
12040 Entry.setLinkage(Linkage);
12041 }
12042 return;
12043 }
12045 Flags ==
12047 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12048 Addr, VarSize, Flags, Linkage,
12049 VarName.str());
12050 else
12051 OffloadEntriesDeviceGlobalVar.try_emplace(
12052 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12053 ++OffloadingEntriesNum;
12054 }
12055 }
12056
// Invokes Action on every recorded device-global-var (name, entry) pair.
// NOTE(review): garbled doxygen extraction -- the signature lines (12057,
// 12058) were dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
12059 // Scan all target region entries and perform the provided action.
12060 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12061 Action(E.getKey(), E.getValue());
12062 }
12063
12064//===----------------------------------------------------------------------===//
12065// CanonicalLoopInfo
12066//===----------------------------------------------------------------------===//
12067
// Appends the loop's six control blocks (preheader, header, cond, latch,
// exit, after) to the output container.
// NOTE(review): garbled doxygen extraction -- the parameter line (12069,
// presumably a SmallVectorImpl<BasicBlock *> &BBs) was dropped; leading
// integers are upstream line numbers. Code kept byte-identical.
12068void CanonicalLoopInfo::collectControlBlocks(
12070 // We only count those BBs as control block for which we do not need to
12071 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12072 // flow. For consistency, this also means we do not add the Body block, which
12073 // is just the entry to the body code.
12074 BBs.reserve(BBs.size() + 6);
12075 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12076}
12077
// Returns the preheader: the unique predecessor of the header that is not
// the latch.
// NOTE(review): garbled doxygen extraction -- the signature line (12078) was
// dropped; leading integers are upstream line numbers. Code kept
// byte-identical.
12079 assert(isValid() && "Requires a valid canonical loop");
12080 for (BasicBlock *Pred : predecessors(Header)) {
12081 if (Pred != Latch)
12082 return Pred;
12083 }
12084 llvm_unreachable("Missing preheader");
12085}
12086
12087void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12088 assert(isValid() && "Requires a valid canonical loop");
12089
12090 Instruction *CmpI = &getCond()->front();
12091 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12092 CmpI->setOperand(1, TripCount);
12093
12094#ifndef NDEBUG
12095 assertOK();
12096#endif
12097}
12098
12099void CanonicalLoopInfo::mapIndVar(
12100 llvm::function_ref<Value *(Instruction *)> Updater) {
12101 assert(isValid() && "Requires a valid canonical loop");
12102
12103 Instruction *OldIV = getIndVar();
12104
12105 // Record all uses excluding those introduced by the updater. Uses by the
12106 // CanonicalLoopInfo itself to keep track of the number of iterations are
12107 // excluded.
12108 SmallVector<Use *> ReplacableUses;
12109 for (Use &U : OldIV->uses()) {
12110 auto *User = dyn_cast<Instruction>(U.getUser());
12111 if (!User)
12112 continue;
12113 if (User->getParent() == getCond())
12114 continue;
12115 if (User->getParent() == getLatch())
12116 continue;
12117 ReplacableUses.push_back(&U);
12118 }
12119
12120 // Run the updater that may introduce new uses
12121 Value *NewIV = Updater(OldIV);
12122
12123 // Replace the old uses with the value returned by the updater.
12124 for (Use *U : ReplacableUses)
12125 U->set(NewIV);
12126
12127#ifndef NDEBUG
12128 assertOK();
12129#endif
12130}
12131
// Debug-only consistency check of the canonical loop's CFG shape and
// induction-variable/trip-count invariants; a no-op in release builds.
// NOTE(review): garbled doxygen extraction -- the signature line (12132,
// presumably CanonicalLoopInfo::assertOK) was dropped; leading integers are
// upstream line numbers. Code kept byte-identical.
12133#ifndef NDEBUG
12134 // No constraints if this object currently does not describe a loop.
12135 if (!isValid())
12136 return;
12137
12138 BasicBlock *Preheader = getPreheader();
12139 BasicBlock *Body = getBody();
12140 BasicBlock *After = getAfter();
12141
12142 // Verify standard control-flow we use for OpenMP loops.
12143 assert(Preheader);
12144 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12145 "Preheader must terminate with unconditional branch");
12146 assert(Preheader->getSingleSuccessor() == Header &&
12147 "Preheader must jump to header");
12148
12149 assert(Header);
12150 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12151 "Header must terminate with unconditional branch");
12152 assert(Header->getSingleSuccessor() == Cond &&
12153 "Header must jump to exiting block");
12154
12155 assert(Cond);
12156 assert(Cond->getSinglePredecessor() == Header &&
12157 "Exiting block only reachable from header");
12158
12159 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12160 "Exiting block must terminate with conditional branch");
12161 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12162 "Exiting block's first successor jump to the body");
12163 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12164 "Exiting block's second successor must exit the loop");
12165
12166 assert(Body);
12167 assert(Body->getSinglePredecessor() == Cond &&
12168 "Body only reachable from exiting block");
12169 assert(!isa<PHINode>(Body->front()));
12170
12171 assert(Latch);
12172 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12173 "Latch must terminate with unconditional branch");
12174 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12175 // TODO: To support simple redirecting of the end of the body code that has
12176 // multiple; introduce another auxiliary basic block like preheader and after.
12177 assert(Latch->getSinglePredecessor() != nullptr);
12178 assert(!isa<PHINode>(Latch->front()));
12179
12180 assert(Exit);
12181 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12182 "Exit block must terminate with unconditional branch");
12183 assert(Exit->getSingleSuccessor() == After &&
12184 "Exit block must jump to after block");
12185
12186 assert(After);
12187 assert(After->getSinglePredecessor() == Exit &&
12188 "After block only reachable from exit block");
12189 assert(After->empty() || !isa<PHINode>(After->front()));
12190
// The canonical IV is a header PHI: zero from the preheader, IV+1 from the
// latch.
12191 Instruction *IndVar = getIndVar();
12192 assert(IndVar && "Canonical induction variable not found?");
12193 assert(isa<IntegerType>(IndVar->getType()) &&
12194 "Induction variable must be an integer");
12195 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12196 "Induction variable must be a PHI in the loop header");
12197 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12198 assert(
12199 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12200 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12201
12202 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12203 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12204 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12205 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12206 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12207 ->isOne());
12208
12209 Value *TripCount = getTripCount();
12210 assert(TripCount && "Loop trip count not found?");
12211 assert(IndVar->getType() == TripCount->getType() &&
12212 "Trip count and induction variable must have the same type");
12213
// NOTE(review): the predicate checked below is ICMP_ULT (an *unsigned*
// less-than), but the assertion message says "signed" -- the message looks
// stale. Left untouched since this is a documentation-only edit.
12214 auto *CmpI = cast<CmpInst>(&Cond->front());
12215 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12216 "Exit condition must be a signed less-than comparison");
12217 assert(CmpI->getOperand(0) == IndVar &&
12218 "Exit condition must compare the induction variable");
12219 assert(CmpI->getOperand(1) == TripCount &&
12220 "Exit condition must compare with the trip count");
12221#endif
12222}
12223
// Marks this CanonicalLoopInfo as no longer describing a loop by nulling its
// control-block pointers (isValid() presumably keys off these -- confirm).
// NOTE(review): garbled doxygen extraction -- the signature line (12224,
// presumably CanonicalLoopInfo::invalidate) was dropped; leading integers are
// upstream line numbers. Code kept byte-identical.
12225 Header = nullptr;
12226 Cond = nullptr;
12227 Latch = nullptr;
12228 Exit = nullptr;
12229}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:990
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target 'to' entry.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, AffinityData Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for '#omp task'.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read or write bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1099
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1161
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1177
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:167
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:87
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:375
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
a struct to pack relevant information while generating atomic Ops
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...